def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        sys.exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    log.info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None
    wordEmbedding = None

    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide either the word_embedding argument or both word_lexicon and word_emb_size.")
        sys.exit(1)

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            sys.exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        sys.exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents one token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value of this matrix is an index that represents a token in
    # the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding
    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output of shape
    # (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None
    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))
    convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize,
                             W=convW, b=convb, weightInitialization=weightInit)
    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))
    hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))
    sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
        sys.exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)

        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1,
                                               shuffle=shuffle, maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output.
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        sys.exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt,
                       prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics,
                       testMetrics=testMetrics, mode=mode)

    # Training.
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

        # Saving model after training.
        if args.save_wordEmbedding:
            embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
            log.info("Saved word embedding to file: %s" % args.save_wordEmbedding)
        if args.save_conv:
            convLinear.save(args.save_conv)
            log.info("Saved convolution layer to file: %s" % args.save_conv)
        if args.save_hiddenLayer:
            hiddenLinear.save(args.save_hiddenLayer)
            log.info("Saved hidden layer to file: %s" % args.save_hiddenLayer)
        if args.save_softmax:
            sotmaxLinearInput.save(args.save_softmax)
            log.info("Saved softmax layer to file: %s" % args.save_softmax)

    # Testing.
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False)

        log.info("Testing")
        model.test(testIterator)
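# The main() above takes all of its hyperparameters from a single JSON file that is
# parsed by JsonArgParser(PARAMETERS). Below is a minimal, hypothetical configuration:
# the key names mirror the attributes read from `args` in main(), but the file paths,
# the filter class and the concrete values are placeholders, and the authoritative
# schema is the PARAMETERS definition, which is not shown here.
EXAMPLE_CONFIG = {
    "train": "data/train.txt",
    "dev": "data/dev.txt",
    "test": "data/test.txt",
    "word_embedding": "embeddings/word2vec.txt",
    "labels": ["yes", "no"],
    "filters": ["data.filters.LowerCaseFilter"],
    "lr": 0.01,
    "alg": "sgd",
    "decay": "linear",
    "num_epochs": 10,
    "shuffle": True,
    "normalization": "minmax",
    "word_window_size": 5,
    "hidden_size": 300,
    "conv_size": 100,
    "start_symbol": "</s>",
    "end_symbol": "</s>",
    "load_method": "sync",
    "fix_word_embedding": False,
    "seed": 31415
}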
def main(**kwargs): log = logging.getLogger(__name__) log.info(kwargs) if kwargs["seed"] != None: random.seed(kwargs["seed"]) np.random.seed(kwargs["seed"]) filters = [] for filterName in kwargs["filters"]: moduleName, className = filterName.rsplit('.', 1) log.info("Usando o filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) wordWindowSize = kwargs["word_window_size"] hiddenLayerSize = kwargs["hidden_size"] batchSize = kwargs["batch_size"] startSymbol = kwargs["start_symbol"] endSymbol = kwargs["end_symbol"] numEpochs = kwargs["num_epochs"] lr = kwargs["lr"] tagLexicon = createLexiconUsingFile(kwargs["label_file"]) # _lambda = theano.shared(kwargs["lambda"], "lambda") _lambda = theano.shared(0.0, "lambda") useAdagrad = kwargs["adagrad"] shuffle = kwargs["shuffle"] supHiddenLayerSize = kwargs["hidden_size_supervised_part"] unsupHiddenLayerSize = kwargs["hidden_size_unsupervised_part"] normalization = kwargs["normalization"] activationHiddenExtractor = kwargs["activation_hidden_extractor"] withCharWNN = kwargs["with_charwnn"] convSize = kwargs["conv_size"] charEmbeddingSize = kwargs["char_emb_size"] charWindowSize = kwargs["char_window_size"] startSymbolChar = "</s>" if kwargs["charwnn_with_act"]: charAct = tanh else: charAct = None # TODO: the maximum number of characters of word is fixed in 20. numMaxChar = 20 if kwargs["decay"].lower() == "normal": decay = 0.0 elif kwargs["decay"].lower() == "divide_epoch": decay = 1.0 # Add the lexicon of target domainLexicon = Lexicon() domainLexicon.put("0") domainLexicon.put("1") domainLexicon.stopAdd() log.info("Reading W2v File1") wordEmbedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"], RandomUnknownStrategy()) log.info("Reading training examples") # Generators inputGenerators = [ WordWindowGenerator(wordWindowSize, wordEmbedding, filters, startSymbol) ] outputGeneratorTag = LabelGenerator(tagLexicon) if withCharWNN: # Create the character embedding charEmbedding = EmbeddingFactory().createRandomEmbedding( charEmbeddingSize) # Insert the padding of the character window charEmbedding.put(startSymbolChar) # Insert the character that will be used to fill the matrix # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication. artificialChar = "ART_CHAR" charEmbedding.put(artificialChar) inputGenerators.append( CharacterWindowGenerator(charEmbedding, numMaxChar, charWindowSize, wordWindowSize, artificialChar, startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol)) unsupervisedLabelSource = ConstantLabel(domainLexicon, "0") # Reading supervised and unsupervised data sets. 
trainSupervisedDatasetReader = TokenLabelReader( kwargs["train_source"], kwargs["token_label_separator"]) trainSupervisedBatch = SyncBatchIterator( trainSupervisedDatasetReader, inputGenerators, [outputGeneratorTag, unsupervisedLabelSource], batchSize[0], shuffle=shuffle) # Get Unsupervised Input unsupervisedLabelTarget = ConstantLabel(domainLexicon, "1") trainUnsupervisedDatasetReader = TokenReader(kwargs["train_target"]) trainUnsupervisedDatasetBatch = SyncBatchIterator( trainUnsupervisedDatasetReader, inputGenerators, [unsupervisedLabelTarget], batchSize[1], shuffle=shuffle) # Stopping to add new words, labels and chars wordEmbedding.stopAdd() tagLexicon.stopAdd() domainLexicon.stopAdd() if withCharWNN: charEmbedding.stopAdd() # Printing embedding information dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize)) log.info( "Size of char dictionary and char embedding size: %d and %d" % (charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize())) # Word Embedding Normalization if normalization == "zscore": wordEmbedding.zscoreNormalization() elif normalization == "minmax": wordEmbedding.minMaxNormalization() elif normalization == "mean": wordEmbedding.meanNormalization() elif normalization == "none" or not normalization: pass else: raise Exception() # Source input wordWindowSource = T.lmatrix(name="windowSource") sourceInput = [wordWindowSource] # Create the layers related with the extractor of features embeddingLayerSrc = EmbeddingLayer(wordWindowSource, wordEmbedding.getEmbeddingMatrix(), trainable=True) flattenSrc = FlattenLayer(embeddingLayerSrc) if withCharWNN: log.info("Using charwnn") # Create the charwn charWindowIdxSrc = T.ltensor4(name="char_window_idx_source") sourceInput.append(charWindowIdxSrc) charEmbeddingConvLayerSrc = EmbeddingConvolutionalLayer( charWindowIdxSrc, charEmbedding.getEmbeddingMatrix(), numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct) layerBeforeLinearSrc = ConcatenateLayer( [flattenSrc, charEmbeddingConvLayerSrc]) sizeLayerBeforeLinearSrc = wordWindowSize * ( wordEmbedding.getEmbeddingSize() + convSize) else: layerBeforeLinearSrc = flattenSrc sizeLayerBeforeLinearSrc = wordWindowSize * wordEmbedding.getEmbeddingSize( ) if activationHiddenExtractor == "tanh": log.info("Using tanh in the hidden layer of extractor") linear1 = LinearLayer(layerBeforeLinearSrc, sizeLayerBeforeLinearSrc, hiddenLayerSize, weightInitialization=GlorotUniform()) act1 = ActivationLayer(linear1, tanh) elif activationHiddenExtractor == "sigmoid": log.info("Using sigmoid in the hidden layer of extractor") linear1 = LinearLayer(layerBeforeLinearSrc, sizeLayerBeforeLinearSrc, hiddenLayerSize, weightInitialization=SigmoidGenerator()) act1 = ActivationLayer(linear1, sigmoid) else: raise Exception() # Create the layers with the Tagger if supHiddenLayerSize == 0: layerBeforeSupSoftmax = act1 layerSizeBeforeSupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the supervised softmax.") else: linear2 = LinearLayer(act1, hiddenLayerSize, supHiddenLayerSize, weightInitialization=GlorotUniform()) act2 = ActivationLayer(linear2, tanh) layerBeforeSupSoftmax = act2 layerSizeBeforeSupSoftmax = supHiddenLayerSize log.info("It inserted the layer before the supervised softmax.") supervisedLinear = LinearLayer(layerBeforeSupSoftmax, layerSizeBeforeSupSoftmax, tagLexicon.getLen(), 
weightInitialization=ZeroWeightGenerator()) supervisedSoftmax = ActivationLayer(supervisedLinear, softmax) # Create the layers with the domain classifier gradientReversalSource = GradientReversalLayer(act1, _lambda) if unsupHiddenLayerSize == 0: layerBeforeUnsupSoftmax = gradientReversalSource layerSizeBeforeUnsupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the unsupervised softmax.") else: unsupervisedSourceLinearBf = LinearLayer( gradientReversalSource, hiddenLayerSize, unsupHiddenLayerSize, weightInitialization=GlorotUniform()) actUnsupervisedSourceBf = ActivationLayer(unsupervisedSourceLinearBf, tanh) layerBeforeUnsupSoftmax = actUnsupervisedSourceBf layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize log.info("It inserted the layer before the unsupervised softmax.") unsupervisedSourceLinear = LinearLayer( layerBeforeUnsupSoftmax, layerSizeBeforeUnsupSoftmax, domainLexicon.getLen(), weightInitialization=ZeroWeightGenerator()) unsupervisedSourceSoftmax = ActivationLayer(unsupervisedSourceLinear, softmax) ## Target Part windowTarget = T.lmatrix(name="windowTarget") targetInput = [windowTarget] # Create the layers related with the extractor of features embeddingLayerUnsuper1 = EmbeddingLayer( windowTarget, embeddingLayerSrc.getParameters()[0], trainable=True) flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1) if withCharWNN: log.info("Using charwnn") # Create the charwn charWindowIdxTgt = T.ltensor4(name="char_window_idx_target") targetInput.append(charWindowIdxTgt) charEmbeddingConvLayerTgt = EmbeddingConvolutionalLayer( charWindowIdxTgt, charEmbeddingConvLayerSrc.getParameters()[0], numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct, trainable=True) layerBeforeLinearTgt = ConcatenateLayer( [flattenUnsuper1, charEmbeddingConvLayerTgt]) sizeLayerBeforeLinearTgt = wordWindowSize * ( wordEmbedding.getEmbeddingSize() + convSize) else: layerBeforeLinearTgt = flattenUnsuper1 sizeLayerBeforeLinearTgt = wordWindowSize * wordEmbedding.getEmbeddingSize( ) w, b = linear1.getParameters() linearUnsuper1 = LinearLayer(layerBeforeLinearTgt, sizeLayerBeforeLinearTgt, hiddenLayerSize, W=w, b=b, trainable=True) if activationHiddenExtractor == "tanh": log.info("Using tanh in the hidden layer of extractor") actUnsupervised1 = ActivationLayer(linearUnsuper1, tanh) elif activationHiddenExtractor == "sigmoid": log.info("Using sigmoid in the hidden layer of extractor") actUnsupervised1 = ActivationLayer(linearUnsuper1, sigmoid) else: raise Exception() # Create the layers with the domain classifier grandientReversalTarget = GradientReversalLayer(actUnsupervised1, _lambda) if unsupHiddenLayerSize == 0: layerBeforeUnsupSoftmax = grandientReversalTarget layerSizeBeforeUnsupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the unsupervised softmax.") else: w, b = unsupervisedSourceLinearBf.getParameters() unsupervisedTargetLinearBf = LinearLayer(grandientReversalTarget, hiddenLayerSize, unsupHiddenLayerSize, W=w, b=b, trainable=True) actUnsupervisedTargetLinearBf = ActivationLayer( unsupervisedTargetLinearBf, tanh) layerBeforeUnsupSoftmax = actUnsupervisedTargetLinearBf layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize log.info("It inserted the layer before the unsupervised softmax.") w, b = unsupervisedSourceLinear.getParameters() unsupervisedTargetLinear = LinearLayer(layerBeforeUnsupSoftmax, layerSizeBeforeUnsupSoftmax, domainLexicon.getLen(), W=w, b=b, trainable=True) unsupervisedTargetSoftmax = ActivationLayer(unsupervisedTargetLinear, softmax) # 
Set loss and prediction and retrieve all layers supervisedLabel = T.lvector("supervisedLabel") unsupervisedLabelSource = T.lvector("unsupervisedLabelSource") unsupervisedLabelTarget = T.lvector("unsupervisedLabelTarget") supervisedOutput = supervisedSoftmax.getOutput() supervisedPrediction = ArgmaxPrediction(1).predict(supervisedOutput) supervisedLoss = NegativeLogLikelihood().calculateError( supervisedOutput, supervisedPrediction, supervisedLabel) unsupervisedOutputSource = unsupervisedSourceSoftmax.getOutput() unsupervisedPredSource = ArgmaxPrediction(1).predict( unsupervisedOutputSource) unsupervisedLossSource = NegativeLogLikelihood().calculateError( unsupervisedOutputSource, None, unsupervisedLabelSource) unsupervisedOutputTarget = unsupervisedTargetSoftmax.getOutput() unsupervisedPredTarget = ArgmaxPrediction(1).predict( unsupervisedOutputTarget) unsupervisedLossTarget = NegativeLogLikelihood().calculateError( unsupervisedOutputTarget, None, unsupervisedLabelTarget) # Creates model if useAdagrad: log.info("Using ADAGRAD") opt = Adagrad(lr=lr, decay=decay) else: log.info("Using SGD") opt = SGD(lr=lr, decay=decay) allLayersSource = supervisedSoftmax.getLayerSet( ) | unsupervisedSourceSoftmax.getLayerSet() allLayersTarget = unsupervisedTargetSoftmax.getLayerSet() unsupervisedLossTarget *= float( trainSupervisedBatch.size()) / trainUnsupervisedDatasetBatch.size() supervisedTrainMetrics = [ LossMetric("TrainSupervisedLoss", supervisedLoss), AccuracyMetric("TrainSupervisedAcc", supervisedLabel, supervisedPrediction), LossMetric("TrainUnsupervisedLoss", unsupervisedLossSource), AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelSource, unsupervisedPredSource) ] unsupervisedTrainMetrics = [ LossMetric("TrainUnsupervisedLoss", unsupervisedLossTarget), AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelTarget, unsupervisedPredTarget) ] evalMetrics = [ AccuracyMetric("EvalAcc", supervisedLabel, supervisedPrediction) ] testMetrics = [ AccuracyMetric("TestAcc", supervisedLabel, supervisedPrediction) ] #TODO: Não tive tempo de testar o código depois das modificações GradientReversalModel(sourceInput, targetInput, supervisedLabel, unsupervisedLabelSource, unsupervisedLabelTarget, allLayersSource, allLayersTarget, opt, supervisedPrediction, supervisedLoss, unsupervisedLossSource, unsupervisedLossTarget, supervisedTrainMetrics, unsupervisedTrainMetrics, evalMetrics, testMetrics, mode=None) # Get dev inputs and output log.info("Reading development examples") devDatasetReader = TokenLabelReader(kwargs["dev"], kwargs["token_label_separator"]) devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGeneratorTag], sys.maxint, shuffle=False) callbacks = [] # log.info("Usando lambda fixo: " + str(_lambda.get_value())) log.info("Usando lambda variado. alpha=" + str(kwargs["alpha"]) + " height=" + str(kwargs["height"])) callbacks.append( ChangeLambda(_lambda, kwargs["alpha"], numEpochs, kwargs["height"])) if kwargs["additional_dev"]: callbacks.append( AdditionalDevDataset(model, kwargs["additional_dev"], kwargs["token_label_separator"], inputGenerators, outputGeneratorTag)) # Training Model model.train([trainSupervisedBatch, trainUnsupervisedDatasetBatch], numEpochs, devReader, callbacks=callbacks)
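# The ChangeLambda(_lambda, alpha, numEpochs, height) callback above presumably anneals
# the gradient-reversal coefficient over training. A sketch of the usual schedule from
# Ganin & Lempitsky (2015), assuming `alpha` plays the role of the gamma constant and
# `height` scales the upper bound; the callback's real formula may differ.
import math

def grl_lambda_schedule(epoch, num_epochs, alpha=10.0, height=1.0):
    # Training progress p in [0, 1]; lambda grows smoothly from 0 towards `height`.
    p = float(epoch) / max(num_epochs - 1, 1)
    return height * (2.0 / (1.0 + math.exp(-alpha * p)) - 1.0)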
def trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow, labelLexicon):
    # Build neural network.
    wordWindow = T.lmatrix("word_window")
    inputModel = [wordWindow]

    wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), borrow=borrow,
                                        structGrad=args.struct_grad, trainable=True, name="word_embedding_layer")
    flatWordEmbedding = FlattenLayer(wordEmbeddingLayer)

    charWindowIdxs = T.ltensor4(name="char_window_idx")
    inputModel.append(charWindowIdxs)

    # # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(), 20,
                                                         args.conv_size, args.char_window_size, args.char_emb_size,
                                                         tanh, structGrad=args.char_struct_grad,
                                                         name="char_convolution_layer", borrow=borrow)

    layerBeforeLinear = ConcatenateLayer([flatWordEmbedding, charEmbeddingConvLayer])
    sizeLayerBeforeLinear = args.word_window_size * (wordEmbedding.getEmbeddingSize() + args.conv_size)

    hiddenActFunction = method_name(args.hidden_activation_function)
    weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

    linearHidden = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, args.hidden_size,
                               weightInitialization=weightInit, name="linear1")
    actHidden = ActivationLayer(linearHidden, hiddenActFunction)

    linearSoftmax = LinearLayer(actHidden, args.hidden_size, labelLexicon.getLen(),
                                weightInitialization=ZeroWeightGenerator(), name="linear_softmax")
    actSoftmax = ActivationLayer(linearSoftmax, softmax)

    prediction = ArgmaxPrediction(1).predict(actSoftmax.getOutput())

    # Output symbolic tensor variable.
    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0
    else:
        log.error("Unknown decay argument: %s" % args.decay)
        sys.exit(1)

    if args.adagrad:
        log.info("Training algorithm: Adagrad")
        opt = Adagrad(lr=args.lr, decay=decay)
    else:
        log.info("Training algorithm: SGD")
        opt = SGD(lr=args.lr, decay=decay)

    # Training loss function.
    loss = NegativeLogLikelihood().calculateError(actSoftmax.getOutput(), prediction, y)

    # L2 regularization.
    if args.l2:
        loss += args.l2 * (T.sum(T.square(linearHidden.getParameters()[0])))

    # # TODO: debug
    # opt.lr.tag.test_value = 0.02

    # Metrics.
    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction)
    ]

    evalMetrics = None
    if args.dev:
        evalMetrics = [
            LossMetric("LossDev", loss, True),
            AccuracyMetric("AccDev", y, prediction),
            CustomMetric("CustomMetricDev", y, prediction)
        ]

    testMetrics = None
    if args.test:
        testMetrics = [
            CustomMetric("CustomMetricTest", y, prediction)
        ]

    log.info("Compiling the network...")

    # # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    wnnModel = BasicModel(inputModel, [y], actSoftmax.getLayerSet(), opt, prediction, loss,
                          trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode)

    log.info("Training...")
    wnnModel.train(trainIterator, args.num_epochs, devIterator)
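# In trainNetwork() the "normal"/"divide_epoch" options are mapped to decay values 0.0
# and 1.0 that are handed to SGD/Adagrad. A sketch of the learning-rate schedule this
# most likely encodes (an assumption, since the optimizer classes are not shown here):
# the base rate is divided by (1 + decay * epoch).
def decayed_learning_rate(base_lr, decay, epoch):
    # decay == 0.0 keeps the rate constant; decay == 1.0 divides it by the epoch count.
    return base_lr / (1.0 + decay * epoch)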
def optimizer(optim_name, epsilon=None, gamma=None):
    if optim_name == 'SGD':
        optim = SGD(gamma=gamma, epsilon=epsilon)
    else:
        raise NotImplementedError
    return optim
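# Usage sketch for the factory above. The meaning of `epsilon` and `gamma` depends on
# this project's SGD implementation (not shown); any other optimizer name currently
# raises NotImplementedError. The helper below is hypothetical.
def build_default_optimizer():
    return optimizer('SGD', epsilon=0.01, gamma=0.9)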
def main(args): log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) loadPath = args.load_model if loadPath: with codecs.open(loadPath + ".param", "r", encoding="utf-8") as paramsFile: param = json.load(paramsFile, encoding="utf-8") hiddenActFunctionName = param['hiddenActFunction'] hiddenActFunction = method_name(hiddenActFunctionName) # Loading Embedding log.info("Loading Model") wordEmbedding = EmbeddingFactory().createFromW2V( loadPath + ".wv", ChosenUnknownStrategy(param["unknown"])) labelLexicon = Lexicon() for l in param["labels"]: labelLexicon.put(l) labelLexicon.stopAdd() # Loading model labelWeights = np.load(loadPath + ".npy").item(0) W1 = labelWeights["W_Hidden"] b1 = labelWeights["b_Hidden"] W2 = labelWeights["W_Softmax"] b2 = labelWeights["b_Softmax"] hiddenLayerSize = b1.shape[0] else: W1 = None b1 = None W2 = None b2 = None hiddenActFunctionName = args.hidden_activation_function hiddenActFunction = method_name(hiddenActFunctionName) if args.word_embedding: log.info("Reading W2v File") wordEmbedding = EmbeddingFactory().createFromW2V( args.word_embedding, RandomUnknownStrategy()) wordEmbedding.stopAdd() elif args.hash_lex_size: wordEmbedding = RandomEmbedding(args.word_emb_size, RandomUnknownStrategy(), HashLexicon(args.hash_lex_size)) else: wordEmbedding = EmbeddingFactory().createRandomEmbedding( args.word_emb_size) # Get the inputs and output if args.labels: labelLexicon = createLexiconUsingFile(args.labels) else: labelLexicon = Lexicon() if args.load_hidden_layer: # Loading Hidden Layer log.info("Loading Hidden Layer") hl = np.load(args.load_hidden_layer).item(0) W1 = hl["W_Encoder"] b1 = hl["b_Encoder"] hiddenLayerSize = b1.shape[0] # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = T.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = T.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. 
weightInit = SigmoidGlorot( ) if hiddenActFunction == sigmoid else GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=None, b=None, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # List of input layers (will be concatenated). inputLayers = [maxPooling] # Generate word windows. wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, wordEmbedding, filters, startSymbol, endSymbol) # List of input generators. inputGenerators = [ lambda offer: wordWindowFeatureGenerator(offer["tokens"]) ] concatenatedSize = convSize # Additional features. if args.categorical_features is not None: log.info("Using categorical features: %s" % str([ftr[0] for ftr in args.categorical_features])) for ftr in args.categorical_features: concatenatedSize += ftr[2] ftrLexicon = createLexiconUsingFile(ftr[1]) ftrEmbedding = RandomEmbedding( embeddingSize=ftr[2], unknownGenerateStrategy=RandomUnknownStrategy(), lexicon=ftrLexicon, ) ftrInput = T.lscalar("in_" + ftr[0]) ftrLayer = EmbeddingLayer(ftrInput, ftrEmbedding.getEmbeddingMatrix()) inputGenerators.append( lambda offer: ftrLexicon.put(offer[ftr[0]].strip().lower())) inputTensors.append(ftrInput) inputLayers.append(ftrLayer) log.info("Input layers: %s" % str(inputLayers)) # Concatenate all input layers, when there are more thean one input layer. concatenatedInLayers = maxPooling if len( inputLayers) == 1 else ConcatenateLayer(inputLayers, axis=0) if args.include_hidden_layer: # Hidden layer. hiddenLinear = LinearLayer(concatenatedInLayers, concatenatedSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction) else: # Do not use a hidden layer. log.info("Not using hidden layer!") hiddenAct = concatenatedInLayers hiddenLayerSize = concatenatedSize # Entrada linear da camada softmax. sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Class weights. labelWeights = None if args.labels_probs: numLabels = labelLexicon.getLen() labelWeights = np.zeros(numLabels, dtype=theano.config.floatX) if args.labels_probs.startswith("@"): # Load the dictionary from a JSON file. with codecs.open(args.labels_probs[1:], mode="r", encoding="utf8") as f: labelDistribution = json.load(f) else: # The argument value is already a JSON. labelDistribution = json.loads(args.labels_probs) for k, v in labelDistribution.items(): # The weight of a class is inversely-proportional to its frequency. labelWeights[labelLexicon.getLexiconIndex(k)] = 1.0 / v if args.labels_weights_log: # Attenuate weights for highly unbalanced classes. labelWeights = np.log(labelWeights) log.info("Label weights: " + str(labelWeights)) # Loss function. loss = NegativeLogLikelihoodOneExample(labelWeights).calculateError( softmaxAct.getOutput()[0], prediction, outLabel) # Output generator: generate one label per offer. 
outputGenerators = [TextLabelGenerator(labelLexicon)] if args.train: trainDatasetReader = OfertasReader(args.train) if args.load_method == "sync": log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, intputGenerators, outputGenerators, -1, shuffle=shuffle) wordEmbedding.stopAdd() elif args.load_method == "async": log.info("Examples will be asynchronously loaded.") trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1, shuffle=shuffle, maxqSize=1000) else: log.error("The argument 'load_method' has an invalid value: %s." % args.load_method) sys.exit(1) labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error( "Argument eval_per_iteration cannot be used without a dev argument." ) sys.exit(1) if dev: log.info("Reading development examples") devReader = OfertasReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) if normalizeMethod is not None and loadPath is not None: log.warn( "The word embedding of model was normalized. This can change the result of test." ) # if kwargs["lambda"]: # _lambda = kwargs["lambda"] # log.info("Using L2 with lambda= %.2f", _lambda) # loss += _lambda * (T.sum(T.square(hiddenLinear.getParameters()[0]))) # Decaimento da taxa de aprendizado. decay = 0.0 if args.decay == "linear": decay = 1.0 # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error( "Unknown algorithm: %s. Expected values are: adagrad or sgd." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction), FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # Test metrics. testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction), FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] if args.test_probs: # Append predicted probabilities for the test set. 
testMetrics.append( PredictedProbabilities("TestProbs", softmaxAct.getOutput())) else: if args.test_probs: log.error( "The option test_probs requires a test dataset (option test).") sys.exit(1) # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = Model(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: callback = [] if args.save_model: savePath = args.save_model modelWriter = OfertasModelWritter(savePath, embeddingLayer, hiddenLinear, sotmaxLinearInput, wordEmbedding, labelLexicon, hiddenActFunctionName) callback.append(SaveModelCallback(modelWriter, "eval_acc", True)) log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration, callbacks=callback) # Testing if args.test: log.info("Reading test examples") testReader = OfertasReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator)
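# The labels_probs handling above turns a label -> relative-frequency mapping into class
# weights that are inversely proportional to frequency, optionally attenuated with a log.
# A standalone numpy sketch of that scheme (the helper name and the dict-based interface
# are hypothetical):
import numpy as np

def label_weights_from_probs(label_probs, label_to_index, use_log=False):
    weights = np.zeros(len(label_to_index))
    for label, prob in label_probs.items():
        # The weight of a class is inversely proportional to its frequency.
        weights[label_to_index[label]] = 1.0 / prob
    if use_log:
        # Attenuate weights for highly unbalanced label sets.
        weights = np.log(weights)
    return weights

# Example: {"pos": 0.8, "neg": 0.2} with indices {"pos": 0, "neg": 1} gives [1.25, 5.0].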
learning_rate = 0.01

# OPTIMIZER
OPTIMIZER = 'SGD'

assert DATA_NAME in ['Contracept', 'Heart', 'Yeast']
assert OPTIMIZER == 'SGD'

# Load dataset, model and evaluation metric
train_data, test_data, logistic_regression = _initialize(DATA_NAME)
train_x, train_y = train_data
num_data, num_features = train_x.shape
num_class = len(np.unique(train_y))

optim = SGD()
model = LogisticRegression(num_features)
loss, epoch = model.train(train_x, train_y, batch_size, num_epochs, learning_rate, optim)

test_x, test_y = test_data
hypo, pred = model.predict(test_x)
pred = np.squeeze(pred)

# ======== Edit here =========
result = evaluation_test1(test_y, pred, at=90)
# =============================

for key in result.keys():
    print(key, '\t\t\t:\t\t %.6f' % result[key])
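# The fragment above delegates optimization to model.train() with a plain SGD object.
# Below is a minimal numpy sketch of one SGD step for binary logistic regression, i.e.
# what a single mini-batch update presumably looks like inside model.train() (an
# assumption: the project's LogisticRegression/SGD internals are not shown here).
import numpy as np

def sgd_step(w, b, x_batch, y_batch, lr):
    z = x_batch.dot(w) + b
    p = 1.0 / (1.0 + np.exp(-z))            # sigmoid activation
    grad_z = (p - y_batch) / len(y_batch)   # gradient of the mean NLL w.r.t. z
    w = w - lr * x_batch.T.dot(grad_z)      # weight update
    b = b - lr * grad_z.sum()               # bias update
    return w, b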
def main(): full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 3: log.error("Missing argument: <JSON config file> or/and <Input file>") exit(1) argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'ShortDocArguments') logging.getLogger(__name__).info(argsDict) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None wordEmbedding = None if args.word_embedding: log.info("Reading W2v File") (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__") wordLexicon.stopAdd() elif args.word_lexicon and args.word_emb_size: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False) wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size) wordLexicon.stopAdd() else: log.error( "You must provide argument word_embedding or word_lexicon and word_emb_size" ) # Create the lexicon of labels. labelLexicon = None if args.labels is not None: if args.label_lexicon is not None: log.error( "Only one of the parameters label_lexicon and labels can be provided!" ) exit(1) labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False) elif args.label_lexicon is not None: labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False) else: log.error( "One of the parameters label_lexicon or labels must be provided!") exit(1) # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = tensor.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = tensor.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # if not args.train and args.load_wordEmbedding: # attrs = np.load(args.load_wordEmbedding) # embeddingLayer.load(attrs) # log.info("Loaded word embedding (shape %s) from file %s" % ( # str(attrs[0].shape), args.load_wordEmbedding)) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). 
flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convW = None convb = None if not args.train and args.load_conv: convNPY = np.load(args.load_conv) convW = convNPY[0] convb = convNPY[1] log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv)) convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=convW, b=convb, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Hidden layer. if not args.train and args.load_hiddenLayer: hiddenNPY = np.load(args.load_hiddenLayer) W1 = hiddenNPY[0] b1 = hiddenNPY[1] log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer)) hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, tanh) # Entrada linear da camada softmax. if not args.train and args.load_softmax: hiddenNPY = np.load(args.load_softmax) W2 = hiddenNPY[0] b2 = hiddenNPY[1] log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax)) sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. if args.label_weights is not None and len( args.label_weights) != labelLexicon.getLen(): log.error( "Number of label weights (%d) is different from number of labels (%d)!" % (len(args.label_weights), labelLexicon.getLen())) nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights) loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel) # Input generators: word window. inputGenerators = [ WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol) ] # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] # outputGenerators = [lambda label: labelLexicon.put(label)] evalPerIteration = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. decay = None if args.decay == "none": decay = 0.0 elif args.decay == "linear": decay = 1.0 else: log.error("Unknown decay parameter %s." % args.decay) exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error("Unknown algorithm: %s." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. 
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt,
                       prediction=prediction, loss=loss, mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)

    # GETS HIDDEN LAYER:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # GRAPH FOR PREDICTION LAYER
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))
    # print tweets

    # graph.getResultsFor(t) returns the prediction for a given tweet t.
    try:
        output_file = open("Output.txt", "w")
    except:
        print "Failed to create the output file\n"

    try:
        for t in tweets:
            output_file.write(t.replace('\n', '').replace('\t', '') + "\t " +
                              lblTxt[graph.getResultsFor(t)] + "\n")
        print "Results generated successfully!\n"
    except:
        print "Error while generating the results\n"
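# The prediction loop above writes one "<tweet>\t<label>" line per input tweet. A tidier
# sketch of the same output format using context managers so that both files are always
# closed (write_predictions and its `predict` argument are hypothetical; in the script
# above, graph.getResultsFor plays the role of `predict`):
import codecs

def write_predictions(input_path, output_path, predict, labels=("Sim", "Nao")):
    with codecs.open(input_path, "r", encoding="utf-8") as inp, \
         codecs.open(output_path, "w", encoding="utf-8") as out:
        for line in inp:
            text = line.replace('\n', '').replace('\t', '')
            out.write(text + "\t" + labels[predict(text)] + "\n")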
def main(args): log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.wv_normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None hiddenActFunction = tanh if args.word_embedding: log.info("Reading W2v File") (lexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol='unknown') lexicon.stopAdd() else: wordEmbedding = EmbeddingFactory().createRandomEmbedding( args.word_emb_size) # Get the inputs and output if args.labels: labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False) else: labelLexicon = Lexicon() # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = T.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = T.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = SigmoidGlorot( ) if hiddenActFunction == sigmoid else GlorotUniform() # Convolution layer. Convolução no texto de um documento. convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=None, b=None, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Generate word windows. wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon, filters, startSymbol, endSymbol) # List of input generators. inputGenerators = [wordWindowFeatureGenerator] # Hidden layer. hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction) # Entrada linear da camada softmax. sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. 
loss = NegativeLogLikelihoodOneExample().calculateError( softmaxAct.getOutput()[0], prediction, outLabel) # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] if args.train: trainDatasetReader = DocReader(args.train) log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1, shuffle=shuffle) lexicon.stopAdd() labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error( "Argument eval_per_iteration cannot be used without a dev argument." ) sys.exit(1) if dev: log.info("Reading development examples") devReader = DocReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. if args.decay == "linear": decay = 1.0 elif args.decay == "none": decay = 0.0 else: log.error("Unknown decay strategy %s. Expected: none or linear." % args.decay) sys.exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error( "Unknown algorithm: %s. Expected values are: adagrad or sgd." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction) ] # Test metrics. testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction) ] # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration) # Testing if args.test: log.info("Reading test examples") testReader = DocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator)
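# The scripts above accept "minmax", "mean" and "zscore" as word-embedding normalization
# methods. A numpy sketch of what these options presumably compute over the embedding
# matrix (the exact semantics of the Embedding normalization methods are an assumption):
import numpy as np

def normalize_embedding(matrix, method):
    if method == "minmax":
        mn, mx = matrix.min(), matrix.max()
        return (matrix - mn) / (mx - mn)
    if method == "mean":
        return matrix - matrix.mean(axis=0)
    if method == "zscore":
        return (matrix - matrix.mean(axis=0)) / matrix.std(axis=0)
    raise ValueError("Unknown normalization method: %s" % method)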
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] is not None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    # Load the filter classes.
    filters = []
    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Using filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    # Get the inputs and output.
    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]

    labelLexicon = createLexiconUsingFile(kwargs["label_file"])

    log.info("Reading training examples")

    log.info("Reading W2v File1")
    embedding1 = EmbeddingFactory().createFromW2V(kwargs["word_embedding1"], RandomUnknownStrategy())

    # Supervised part
    # Learner1
    input1 = T.lmatrix(name="input1")
    embeddingLayer1 = EmbeddingLayer(input1, embedding1.getEmbeddingMatrix(), trainable=True)
    flatten1 = FlattenLayer(embeddingLayer1)
    linear11 = LinearLayer(flatten1, wordWindowSize * embedding1.getEmbeddingSize(), hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act11 = ActivationLayer(linear11, tanh)
    linear12 = LinearLayer(act11, hiddenLayerSize, labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act12 = ActivationLayer(linear12, softmax)

    # Learner2
    log.info("Reading W2v File2")
    embedding2 = EmbeddingFactory().createFromW2V(kwargs["word_embedding2"], RandomUnknownStrategy())
    input2 = T.lmatrix(name="input2")
    embeddingLayer2 = EmbeddingLayer(input2, embedding2.getEmbeddingMatrix(), trainable=True)
    flatten2 = FlattenLayer(embeddingLayer2)
    linear21 = LinearLayer(flatten2, wordWindowSize * embedding2.getEmbeddingSize(), hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act21 = ActivationLayer(linear21, tanh)
    linear22 = LinearLayer(act21, hiddenLayerSize, labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act22 = ActivationLayer(linear22, softmax)

    y = T.lvector("y")

    # Set loss and prediction and retrieve all layers.
    output1 = act12.getOutput()
    prediction1 = ArgmaxPrediction(1).predict(output1)
    loss1 = NegativeLogLikelihood().calculateError(output1, prediction1, y)

    if kwargs["l2"][0]:
        _lambda1 = kwargs["l2"][0]
        log.info("Using L2 with lambda= %.2f", _lambda1)
        loss1 += _lambda1 * (T.sum(T.square(linear11.getParameters()[0])))

    output2 = act22.getOutput()
    prediction2 = ArgmaxPrediction(1).predict(output2)
    loss2 = NegativeLogLikelihood().calculateError(output2, prediction2, y)

    if kwargs["l2"][1]:
        _lambda2 = kwargs["l2"][1]
        log.info("Using L2 with lambda= %.2f", _lambda2)
        loss2 += _lambda2 * (T.sum(T.square(linear21.getParameters()[0])))

    loss = loss1 + loss2

    # CoLearningPrediction: average the two learners' scores and take the argmax.
    output = T.stack([linear12.getOutput(), linear22.getOutput()])
    # return T.argmax(output, 2)[T.argmax(T.max(output, 2), 0), T.arange(output.shape[1])]
    average = T.mean(output, 0)
    prediction = ArgmaxPrediction(1).predict(ActivationLayer(average, softmax).getOutput())
    # prediction = CoLearningWnnPrediction().predict([output1, output2])

    supervisedModeUnit = ModelUnit("supervised_wnn", [input1, input2], y, loss, prediction=prediction)

    # Unsupervised part
    # Learner1 (shares its parameters with the supervised learner 1).
    inputUnsuper1 = T.lmatrix(name="input_unsupervised_1")
    embeddingLayerUnsuper1 = EmbeddingLayer(inputUnsuper1, embeddingLayer1.getParameters()[0], trainable=True)
    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)
    w, b = linear11.getParameters()
    linearUnsuper11 = LinearLayer(flattenUnsuper1, wordWindowSize * embedding1.getEmbeddingSize(),
                                  hiddenLayerSize, W=w, b=b)
    actUnsupervised11 = ActivationLayer(linearUnsuper11, tanh)
    w, b = linear12.getParameters()
    linearUnsuper12 = LinearLayer(actUnsupervised11, hiddenLayerSize, labelLexicon.getLen(), W=w, b=b)
    actUnsuper12 = ActivationLayer(linearUnsuper12, softmax)

    # Learner2 (shares its parameters with the supervised learner 2).
    inputUnsuper2 = T.lmatrix(name="input_unsupervised_2")
    embeddingLayerUnsuper2 = EmbeddingLayer(inputUnsuper2, embeddingLayer2.getParameters()[0], trainable=True)
    flattenUnsuper2 = FlattenLayer(embeddingLayerUnsuper2)
    w, b = linear21.getParameters()
    linearUnsuper21 = LinearLayer(flattenUnsuper2, wordWindowSize * embedding2.getEmbeddingSize(),
                                  hiddenLayerSize, W=w, b=b)
    actUnsuper21 = ActivationLayer(linearUnsuper21, tanh)
    w, b = linear22.getParameters()
    linearUnsuper22 = LinearLayer(actUnsuper21, hiddenLayerSize, labelLexicon.getLen(), W=w, b=b)
    actUnsuper22 = ActivationLayer(linearUnsuper22, softmax)

    # Set loss and prediction and retrieve all layers.
    outputUns1 = actUnsuper12.getOutput()
    predictionUns1 = ArgmaxPrediction(1).predict(outputUns1)
    outputUns2 = actUnsuper22.getOutput()
    predictionUns2 = ArgmaxPrediction(1).predict(outputUns2)

    # unsupervisedLoss = kwargs["lambda"] * (
    #     NegativeLogLikelihood().calculateError(outputUns1, predictionUns1, predictionUns2) +
    #     NegativeLogLikelihood().calculateError(outputUns2, predictionUns2, predictionUns1))
    _lambdaShared = theano.shared(value=kwargs["lambda"], name='lambda', borrow=True)
    unsupervisedLoss = _lambdaShared * (
        NegativeLogLikelihood().calculateError(outputUns1, predictionUns1, predictionUns2) +
        NegativeLogLikelihood().calculateError(outputUns2, predictionUns2, predictionUns1))

    unsupervisedUnit = ModelUnit("unsupervised_wnn", [inputUnsuper1, inputUnsuper2], None, unsupervisedLoss,
                                 yWillBeReceived=False)

    # Create the model.
    model = CoLearningModel(kwargs["loss_uns_epoch"])
    model.addTrainingModelUnit(supervisedModeUnit, metrics=["loss", "acc"])
    model.addTrainingModelUnit(unsupervisedUnit, metrics=["loss"])
    model.setEvaluatedModelUnit(supervisedModeUnit, metrics=["acc"])

    # Compile the model.
    opt1 = SGD(lr=lr[0], decay=1.0)
    opt2 = SGD(lr=lr[1], decay=1.0)
    log.info("Compiling the model")
    model.compile([(opt1, {supervisedModeUnit: act12.getLayerSet(),
                           unsupervisedUnit: actUnsuper12.getLayerSet()}),
                   (opt2, {supervisedModeUnit: act22.getLayerSet(),
                           unsupervisedUnit: actUnsuper22.getLayerSet()})])

    # Generators
    inputGenerator1 = WordWindowGenerator(wordWindowSize, embedding1, filters, startSymbol)
    inputGenerator2 = WordWindowGenerator(wordWindowSize, embedding2, filters, startSymbol)
    outputGenerator = LabelGenerator(labelLexicon)

    # Read the supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(kwargs["train_supervised"], kwargs["token_label_separator"])
    trainSupervisedDatasetReader = SyncBatchIterator(trainSupervisedDatasetReader,
                                                     [inputGenerator1, inputGenerator2],
                                                     [outputGenerator], batchSize[0])
    trainUnsupervisedDataset = TokenReader(kwargs["train_unsupervised"])
    trainUnsupervisedDatasetReader = SyncBatchIterator(trainUnsupervisedDataset,
                                                       [inputGenerator1, inputGenerator2], None, batchSize[1])

    embedding1.stopAdd()
    embedding2.stopAdd()
    labelLexicon.stopAdd()

    # Get dev inputs and output.
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"], kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader, [inputGenerator1, inputGenerator2], [outputGenerator],
                                  sys.maxint, shuffle=False)

    lambdaChange = ChangeLambda(_lambdaShared, kwargs["lambda"], kwargs["loss_uns_epoch"])
    lossCallback = LossCallback(loss1, loss2, input1, input2, y)
    # trainUnsupervisedDatasetReaderAcc = SyncBatchIterator(trainUnsupervisedDataset,
    #                                                       [inputGenerator1, inputGenerator2],
    #                                                       [outputGenerator], sys.maxint)
    # accCallBack = AccCallBack(prediction1, prediction2, input1, input2,
    #                           unsurpervisedDataset=trainUnsupervisedDatasetReaderAcc)

    # Train the model.
    model.train([trainSupervisedDatasetReader, trainUnsupervisedDatasetReader], numEpochs, devReader,
                callbacks=[lambdaChange, lossCallback])
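# ---------------------------------------------------------------------------
# Hedged usage sketch for main(**kwargs) above. The keys are exactly the ones
# the function reads; every path and hyper-parameter value below is a
# hypothetical placeholder, not a setting from the original experiments. It is
# kept as a comment so that importing this module does not start a run.
# ---------------------------------------------------------------------------
# exampleCoLearningKwargs = {
#     "seed": 31415,
#     "filters": [],                       # optionally a list of "module.ClassName" filter names
#     "word_window_size": 5,
#     "hidden_size": 300,
#     "batch_size": [16, 16],              # [supervised, unsupervised]
#     "start_symbol": "</s>",
#     "num_epochs": 10,
#     "lr": [0.01, 0.01],                  # one learning rate per learner
#     "l2": [0.0, 0.0],                    # L2 lambdas per learner; 0 disables the penalty
#     "lambda": 1.0,                       # initial value of the shared unsupervised-loss weight
#     "loss_uns_epoch": 2,                 # passed to CoLearningModel and ChangeLambda
#     "label_file": "labels.txt",
#     "word_embedding1": "embedding1.w2v",
#     "word_embedding2": "embedding2.w2v",
#     "train_supervised": "train_supervised.txt",
#     "train_unsupervised": "train_unsupervised.txt",
#     "dev": "dev.txt",
#     "token_label_separator": "/",
# }
# main(**exampleCoLearningKwargs)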
def mainWnnNegativeSampling(args):
    # Read the parameters.
    embeddingMatrix = None
    wordEmbeddingSize = args.word_embedding_size
    windowSize = args.window_size
    hiddenLayerSize = args.hidden_size
    startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    endSymbol = startSymbol
    noiseRate = args.noise_rate

    # TODO: the algorithm does not support mini-batches, only stochastic (online) training.
    batchSize = 1
    shuffle = args.shuffle
    lr = args.lr
    numEpochs = args.num_epochs
    power = args.power
    minLr = args.min_lr
    numExUpdLr = args.num_examples_updt_lr

    log = logging.getLogger(__name__)
    log.info(str(args))

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    # if args.decay.lower() == "normal":
    #     decay = 0.0
    # elif args.decay.lower() == "divide_epoch":
    #     decay = 1.0

    parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"}

    # Calculate the frequency of each word.
    trainReader = TokenReader(args.train)
    wordLexicon = Lexicon("UUKNNN", "lexicon")
    wordLexicon.put(startSymbol, False)
    totalNumOfTokens = 0
    for tokens, labels in trainReader.read():
        # We don't count the </s>, because this token is only inserted in the sentence to count its frequency.
        totalNumOfTokens += len(tokens)

        # Word2vec considers that the number of lines is the frequency of </s>.
        tokens += [startSymbol]

        for token in tokens:
            wordLexicon.put(token)

    # Prune the words with frequency lower than min_count.
    wordLexicon.prune(args.min_count)
    wordLexicon.stopAdd()

    # Calculate the unigram distribution.
    frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power)
    total = float(frequency.sum())
    # # Print the distribution of all words
    # for _ in xrange(len(frequency)):
    #     print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_], frequency[_] / float(total))
    sampler = Sampler(frequency / float(total))

    # Create a random embedding for each word.
    wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize)
    log.info("Lexicon size: %d" % (wordLexicon.getLen()))

    # Create the NN.
    x = T.lmatrix("word_window")
    y = T.lvector("labels")

    wordEmbeddingLayer = EmbeddingLayer(x, wordEmbedding.getEmbeddingMatrix(), name="embedding")
    flatten = FlattenLayer(wordEmbeddingLayer)
    linear1 = LinearLayer(flatten, wordEmbeddingSize * windowSize, hiddenLayerSize, name="linear1")
    act1 = ActivationLayer(linear1, tanh)

    # Softmax regression. It's like a logistic regression.
    linear2 = LinearLayer(act1, hiddenLayerSize, 1, weightInitialization=ZeroWeightGenerator(),
                          name="linear_softmax_regresion")
    act2 = ActivationLayer(linear2, sigmoid)

    # We clip the sigmoid output because it can be exactly 0 or 1, and ln(0) is infinite, which causes
    # numerical problems.
    output = T.flatten(T.clip(act2.getOutput(), 10 ** -5, 1 - 10 ** -5))

    # Loss function.
    negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum()

    # Set up training.
    inputGenerators = [WordWindowGenerator(windowSize, wordLexicon, [], startSymbol, endSymbol)]
    outputGenerators = [ConstantLabel(labelLexicon=None, label=1)]

    trainIterator = SyncBatchIterator(trainReader, inputGenerators, outputGenerators, batchSize, shuffle)

    trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)]

    allLayers = act2.getLayerSet()

    # opt = SGD(lr=lr, decay=decay)
    opt = SGD(lr=lr)
    model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr, numExUpdLr, totalNumOfTokens, numEpochs,
                                  [x], [y], allLayers, opt, negativeSamplingLoss, trainMetrics)

    # Save the model.
    if args.save_model:
        savePath = args.save_model
        objsToSave = list(act2.getLayerSet()) + [wordLexicon]
        modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

    # Training
    model.train(trainIterator, numEpochs=numEpochs, callbacks=[])

    if args.save_model:
        modelWriter.save()
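# ---------------------------------------------------------------------------
# Hedged numerical sketch of why the sigmoid output is clipped above: without
# clipping, an output of exactly 0 or 1 makes the binary cross-entropy
# infinite. Pure NumPy, independent of the Theano graph; the helper name and
# values are illustrative only. Kept as a comment so module-level behaviour is
# unchanged.
# ---------------------------------------------------------------------------
# def clippedBinaryCrossEntropy(p, y, eps=10 ** -5):
#     p = np.clip(p, eps, 1 - eps)  # same clipping interval used for 'output' above
#     return -(y * np.log(p) + (1 - y) * np.log(1 - p)).sum()
#
# print clippedBinaryCrossEntropy(np.array([0.0, 1.0, 0.7]), np.array([1, 0, 1]))  # finite thanks to clipping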
def mainWnn(args):
    ################################################
    # Initializing parameters
    ################################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load the parameters of the saved model.
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) is not None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters.
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Character used to pad the character matrix of words shorter than the chosen dimension.
    # This padding allows the convolution to be computed as a single matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters per word is fixed at 20.
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of the parameter 'alg' is not valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Read the filters of the wnn.
    log.info("Reading basic filters")
    wordFilters = getFilters(args.word_filters, log)

    # Read the filters of the charwnn.
    log.info("Reading charwnn filters")
    charFilters = getFilters(args.char_filters, log)

    # Read the suffix filters.
    log.info("Reading suffix filters")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Read the capitalization filters.
    log.info("Reading capitalization filters")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicons and exit afterwards
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")
            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")
            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)
            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize,
                                         artificialChar, startSymbolChar, startPaddingWrd=startSymbol,
                                         endPaddingWrd=endSymbol, filters=charFilters))
            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")
            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix size is less than or equal to 0.")
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))
            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))
            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were successfully generated!")
        return

    ################################################
    # Starting training
    ################################################
    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with charwnn.")

    # Read the word lexicon and create the word embeddings.
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")
        wordEmbedding = Embedding(wordLexicon, vectors)
    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read the char lexicon and create the char embeddings.
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")
            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read the suffix lexicon if the suffix size is greater than 0.
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")
                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read the capitalization lexicon.
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")
                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read the labels.
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model or label_file")
        return

    # Normalize the word embedding.
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of the loaded model was normalized. This can change the test results.")

    # Build the neural network.
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the character-level convolution.
            log.info("Using charwnn")
            convSize = args.conv_size
            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs,
                                                                 charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")
            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features.
            concatenateInputs = [flatten]
            nmFetauresByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")
                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)
                nmFetauresByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")
                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(), name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)
                nmFetauresByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord
        else:
            # Use only the word embeddings.
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the NN.
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(), name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())

    # Load the model.
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output.
        dev = args.dev
        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0
    else:
        # Guard against an undefined decay value.
        raise Exception("The value of the parameter 'decay' is not valid.")

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Print embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))
    if withCharWNN:
        log.info("Size of char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))
    if useSuffixFeatures:
        log.info("Size of suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))
    if useCapFeatures:
        log.info("Size of capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compile the model.
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]
    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]
    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None)

    # Training
    if trainReader:
        callback = []

        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)
            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)
            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with the best accuracy on the dev set.
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training.
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                       shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

            for x, labels in testReader:
                inputs = x
                predictions = wnnModel.prediction(inputs)
                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")

            # Close the prediction file explicitly.
            f.close()
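# ---------------------------------------------------------------------------
# Hedged note on the data format consumed by mainWnn above: TokenLabelReader
# is created with args.token_label_separator, so a supervised line is assumed
# to hold one "token<separator>label" pair per token, e.g. with separator "/":
#
#     The/DET cat/NOUN sleeps/VERB
#
# The example sentence and tag set are hypothetical; the authoritative format
# is whatever TokenLabelReader implements.
# ---------------------------------------------------------------------------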