def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)

    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")
        exit(1)

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value in this matrix is an index that represents a token in
    # the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize,
                             W=convW, b=convb, weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
        sys.exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)

        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1,
                                               shuffle=shuffle, maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output.
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt,
                       prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics,
                       testMetrics=testMetrics, mode=mode)

    # Training.
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

    # Saving the model after training.
    if args.save_wordEmbedding:
        embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
        log.info("Saved word embedding (word2vec format) to file: %s" % (args.save_wordEmbedding))
    if args.save_conv:
        convLinear.save(args.save_conv)
        log.info("Saved convolution layer to file: %s" % (args.save_conv))
    if args.save_hiddenLayer:
        hiddenLinear.save(args.save_hiddenLayer)
        log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer))
    if args.save_softmax:
        sotmaxLinearInput.save(args.save_softmax)
        log.info("Saved softmax to file: %s" % (args.save_softmax))

    # Testing.
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False)

        log.info("Testing")
        model.test(testIterator)
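# Usage sketch for the training entry point above (illustrative, not part of the
# original script): main() expects exactly one command-line argument, a JSON config
# file whose keys follow the PARAMETERS spec consumed by JsonArgParser. The key names
# below are the ones actually read through `args` in main(); the values and the
# script/filter names are made-up placeholders.
#
#     python <this_script>.py config.json
#
# where config.json could look like:
#
#     {
#         "train": "data/train.txt",
#         "word_embedding": "embeddings/vectors.w2v",
#         "labels": ["label_a", "label_b"],
#         "filters": ["<module.path>.<FilterClass>"],
#         "lr": 0.01,
#         "alg": "sgd",
#         "decay": "none",
#         "num_epochs": 10,
#         "word_window_size": 5,
#         "conv_size": 50,
#         "hidden_size": 100,
#         "load_method": "sync"
#     }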
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)

    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 3:
        log.error("Missing arguments: <JSON config file> <input file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")
        exit(1)

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value in this matrix is an index that represents a token in
    # the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize,
                             W=convW, b=convb, weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
        sys.exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)
    ]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt,
                       prediction=prediction, loss=loss, mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)

    # GETS HIDDEN LAYER:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # Graph for the prediction layer.
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
        for line in content:
            tweets.append(line.decode('utf-8').encode('utf-8'))

    # print tweets

    # graph.getResultsFor(t) returns the prediction for a given tweet t.
    try:
        output_file = open("Output.txt", "w")
    except:
        print "Failed to create the output file\n"

    try:
        for t in tweets:
            output_file.write(t.replace('\n', '').replace('\t', '') + "\t " +
                              lblTxt[graph.getResultsFor(t)] + "\n")
        print "Results were generated successfully!\n"
    except:
        print "Error while generating the results\n"
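# Usage sketch for the prediction entry point above (illustrative, not part of the
# original script): this variant of main() expects two command-line arguments, the
# JSON config file and a plain-text input file with one tweet per line. Predictions
# are written to "Output.txt" in the current directory, one "<tweet>\t<label>" pair
# per line, using the hard-coded label names in lblTxt.
#
#     python <this_script>.py config.json tweets.txt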
def mainWnnNer(args):
    # Initializing parameters.
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    log.info({"type": "args", "args": args})

    # GPU configuration.
    log.info({"floatX": str(theano.config.floatX), "device": str(theano.config.device)})

    # Parameters.
    # lr = args.lr
    # startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    # numEpochs = args.num_epochs
    # shuffle = args.shuffle
    # normalization = args.normalization
    # wordWindowSize = args.word_window_size
    # hiddenLayerSize = args.hidden_size
    # hiddenActFunctionName = args.hidden_activation_function
    # embeddingSize = args.word_emb_size
    # batchSize = args.batch_size
    # structGrad = args.struct_grad
    # charStructGrad = args.char_struct_grad
    #
    # charEmbeddingSize = args.char_emb_size
    # charWindowSize = args.char_window_size
    # charConvSize = args.conv_size

    # Word filters.
    log.info("Loading word filters...")
    wordFilters = getFilters(args.word_filters, log)

    # Loading/creating word lexicon and word embedding.
    if args.word_embedding is not None:
        log.info("Loading word embedding...")
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon is not None:
        log.info("Loading word lexicon...")
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=args.word_emb_size)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        sys.exit(1)

    # Loading char lexicon.
    log.info("Loading char lexicon...")
    charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")

    # Character embedding.
    charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=args.char_emb_size)

    # Loading label lexicon.
    log.info("Loading label lexicon...")
    labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")

    # Normalize the word embedding.
    if args.normalization is not None:
        normFactor = 1
        if args.norm_factor is not None:
            normFactor = args.norm_factor

        if args.normalization == "minmax":
            log.info("Normalizing word embedding: minmax")
            wordEmbedding.minMaxNormalization(norm_coef=normFactor)
        elif args.normalization == "mean":
            log.info("Normalizing word embedding: mean")
            wordEmbedding.meanNormalization(norm_coef=normFactor)
        else:
            log.error("Unknown normalization method: %s" % args.normalization)
            sys.exit(1)
    elif args.norm_factor is not None:
        log.error("Parameter norm_factor cannot be present without normalization.")
        sys.exit(1)

    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word lexicon is %d and word embedding size is %d" % (dictionarySize, args.word_emb_size))

    # Set up the input and (golden) output generators (readers).
    inputGenerators = [
        WordWindowGenerator(args.word_window_size, wordLexicon, wordFilters, args.start_symbol,
                            args.end_symbol),
        CharacterWindowGenerator(lexicon=charLexicon, numMaxChar=20, charWindowSize=args.char_window_size,
                                 wrdWindowSize=args.word_window_size, artificialChar="ART_CHAR",
                                 startPadding="</s>", startPaddingWrd=args.start_symbol,
                                 endPaddingWrd=args.end_symbol, filters=getFilters([], log))
    ]
    outputGenerator = LabelGenerator(labelLexicon)

    if args.cv is not None:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'),
                                          inputGenerators, [outputGenerator], args.batch_size,
                                          shuffle=args.shuffle, numCVFolds=args.cv.numFolds)

        cvGenerators = trainIterator.getCVGenerators()
        iFold = 0
        numFolds = len(cvGenerators)
        for train, dev in cvGenerators:
            log.info({"cv": {"fold": iFold, "numFolds": numFolds}})
            trainNetwork(args, log, trainIterator=train, devIterator=dev, wordEmbedding=wordEmbedding,
                         charEmbedding=charEmbedding, borrow=False, labelLexicon=labelLexicon)
            iFold += 1
    else:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'),
                                          inputGenerators, [outputGenerator], args.batch_size,
                                          shuffle=args.shuffle)

        # Get dev inputs and (golden) outputs.
        devIterator = None
        if args.dev is not None:
            log.info("Reading development examples")
            devIterator = SyncBatchIterator(TokenLabelPerLineReader(args.dev, labelTknSep='\t'),
                                            inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow=True,
                     labelLexicon=labelLexicon)

    # Testing.
    if args.test:
        log.info("Reading test dataset...")
        testIterator = SyncBatchIterator(TokenLabelPerLineReader(args.test, labelTknSep='\t'),
                                         inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing...")
        wnnModel.test(testIterator)

    log.info("Done!")
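# Note on the NER data files read by mainWnnNer above. This is an assumption based on
# the reader name TokenLabelPerLineReader and its labelTknSep='\t' argument, not
# verified against the reader's implementation: each line is expected to hold one
# token and its label separated by a tab, for example (tokens and tags are made up):
#
#     Maria<TAB>B-PER
#     mora<TAB>O
#     em<TAB>O
#     Recife<TAB>B-LOC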
def mainWnn(args):
    ################################################
    # Initializing parameters
    ################################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load the parameters of the saved model.
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) is not None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters.
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Insert the character that will be used to pad the matrix when a word has fewer
    # characters than the chosen dimension. This allows the convolution to be
    # performed as a matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters per word is fixed at 20.
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Reading the wnn filters.
    log.info("Reading basic filters")
    wordFilters = getFilters(args.word_filters, log)

    # Reading the charwnn filters.
    log.info("Reading charwnn filters")
    charFilters = getFilters(args.char_filters, log)

    # Reading the suffix filters.
    log.info("Reading suffix filters")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Reading the capitalization filters.
    log.info("Reading capitalization filters")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create only the lexicons and exit afterwards
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")
            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")
            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)
            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize,
                                         artificialChar, startSymbolChar, startPaddingWrd=startSymbol,
                                         endPaddingWrd=endSymbol, filters=charFilters))
            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")
            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix size is less than or equal to 0.")
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))
            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))
            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated successfully!")
        return

    ################################################
    # Starting training
    ################################################
    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with charwnn.")

    # Read the word lexicon and create word embeddings.
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                    "word_embedding_layer")
        wordEmbedding = Embedding(wordLexicon, vectors)
    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read the char lexicon and create char embeddings.
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")
            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read the suffix lexicon if the suffix size is greater than 0.
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")
                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read the capitalization lexicon.
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")
                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read the labels.
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model or label_file")
        return

    # Normalize the word embedding.
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of the model was normalized. This can change the test result.")
    # Build the neural network.
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the character-level convolution.
            log.info("Using charwnn")
            convSize = args.conv_size

            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs,
                                                                 charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")

            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features.
            concatenateInputs = [flatten]
            nmFetauresByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")

                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)

                nmFetauresByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")

                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(),
                                             name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)

                nmFetauresByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord
        else:
            # Use only the word embeddings.
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the network.
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(), name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())

    # Load the model.
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output generators.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output.
        dev = args.dev
        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))

    if withCharWNN:
        log.info("Size of char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    if useSuffixFeatures:
        log.info("Size of suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))

    if useCapFeatures:
        log.info("Size of capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compiling.
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]

    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]

    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss,
                          trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics,
                          mode=None)

    # Training.
    if trainReader:
        callback = []

        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)

            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)

            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with the best accuracy on dev.
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training.
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing.
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                       shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

            for x, labels in testReader:
                inputs = x
                predictions = wnnModel.prediction(inputs)

                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")

            f.close()
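# Note on the print_prediction output written above: the loop writes one predicted
# label per line (the string returned by labelLexicon.getLexicon), in the same order
# as the examples produced by testReader; the token text itself is not echoed
# alongside the label.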