예제 #1
0
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Matriz de entrada. Cada linha representa um token da oferta. Cada token é
    # representado por uma janela de tokens (token central e alguns tokens
    # próximos). Cada valor desta matriz corresponde a um índice que representa
    # um token no embedding.
    inWords = tensor.lmatrix("inWords")

    # Categoria correta de uma oferta.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding).
    # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída
    # com a forma (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolução no texto de uma oferta.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize, W=convW, b=convb,
                             weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1, b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Entrada linear da camada softmax.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              - 1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               - 1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            - 1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Decaimento da taxa de aprendizado.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Algoritmo de aprendizado.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

    # Saving model after training
        if args.save_wordEmbedding:
            embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
            log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding))
        if args.save_conv:
            convLinear.save(args.save_conv)
            log.info("Saved convolution layer to file: %s" % (args.save_conv))
        if args.save_hiddenLayer:
            hiddenLinear.save(args.save_hiddenLayer)
            log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer))
        if args.save_softmax:
            sotmaxLinearInput.save(args.save_softmax)
            log.info("Saved softmax to file: %s" % (args.save_softmax))

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         - 1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
예제 #2
0
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 3:
        log.error("Missing argument: <JSON config file> or/and <Input file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon,
                                           hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon,
                                  embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error(
            "You must provide argument word_embedding or word_lexicon and word_emb_size"
        )

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error(
                "Only one of the parameters label_lexicon and labels can be provided!"
            )
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon,
                                            hasUnknowSymbol=False)
    else:
        log.error(
            "One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Matriz de entrada. Cada linha representa um token da oferta. Cada token é
    # representado por uma janela de tokens (token central e alguns tokens
    # próximos). Cada valor desta matriz corresponde a um índice que representa
    # um token no embedding.
    inWords = tensor.lmatrix("inWords")

    # Categoria correta de uma oferta.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding).
    # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída
    # com a forma (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolução no texto de uma oferta.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" %
                 (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=convW,
                             b=convb,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" %
                 (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Entrada linear da camada softmax.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" %
                 (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(
            args.label_weights) != labelLexicon.getLen():
        log.error(
            "Number of label weights (%d) is different from number of labels (%d)!"
            % (len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction,
                                outLabel)

    # Input generators: word window.
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol,
                            endSymbol)
    ]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Decaimento da taxa de aprendizado.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Algoritmo de aprendizado.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters,
                                     startSymbol, endSymbol)

    # GETS HIDDEN LAYER:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # GRAPH FOR PREDICTION LAYER
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))
    #print tweets
    # graph.getResultsFor(t) retorna a predição para dado Tweet t
    try:
        output_file = open("Output.txt", "w")
    except:
        print "Falha em criar o arquivo de saida\n"
    try:
        for t in tweets:
            output_file.write(
                t.replace('\n', '').replace('\t', '') + "\t " +
                lblTxt[graph.getResultsFor(t)] + "\n")
        print "Resultados gerados com sucesso!\n"
    except:
        print "Erro na geração de resultados\n"
예제 #3
0
def mainWnnNer(args):
    # Initializing parameters.
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    log.info({"type": "args", "args": args})

    # GPU configuration.
    log.info({"floatX": str(theano.config.floatX), "device": str(theano.config.device)})

    # Parameters.
    # lr = args.lr
    # startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    # numEpochs = args.num_epochs
    # shuffle = args.shuffle
    # normalization = args.normalization
    # wordWindowSize = args.word_window_size
    # hiddenLayerSize = args.hidden_size
    # hiddenActFunctionName = args.hidden_activation_function
    # embeddingSize = args.word_emb_size
    # batchSize = args.batch_size
    # structGrad = args.struct_grad
    # charStructGrad = args.char_struct_grad
    #
    # charEmbeddingSize = args.char_emb_size
    # charWindowSize = args.char_window_size
    # charConvSize = args.conv_size

    # Word filters.
    log.info("Loading word filters...")
    wordFilters = getFilters(args.word_filters, log)

    # Loading/creating word lexicon and word embedding.
    if args.word_embedding is not None:
        log.info("Loading word embedding...")
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon is not None:
        log.info("Loading word lexicon...")
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=args.word_emb_size)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        sys.exit(1)

    # Loading char lexicon.
    log.info("Loading char lexicon...")
    charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")

    # Character embedding.
    charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=args.char_emb_size)

    # Loading label lexicon.
    log.info("Loading label lexicon...")
    labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")

    # Normalize the word embedding
    if args.normalization is not None:
        normFactor = 1
        if args.norm_factor is not None:
            normFactor = args.norm_factor

        if args.normalization == "minmax":
            log.info("Normalizing word embedding: minmax")
            wordEmbedding.minMaxNormalization(norm_coef=normFactor)
        elif args.normalization == "mean":
            log.info("Normalizing word embedding: mean")
            wordEmbedding.meanNormalization(norm_coef=normFactor)
        else:
            log.error("Unknown normalization method: %s" % args.normalization)
            sys.exit(1)
    elif args.normFactor is not None:
        log.error("Parameter norm_factor cannot be present without normalization.")
        sys.exit(1)

    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word lexicon is %d and word embedding size is %d" % (dictionarySize, args.word_emb_size))

    # Setup the input and (golden) output generators (readers).
    inputGenerators = [
        WordWindowGenerator(args.word_window_size, wordLexicon, wordFilters, args.start_symbol, args.end_symbol),
        CharacterWindowGenerator(lexicon=charLexicon, numMaxChar=20, charWindowSize=args.char_window_size,
                                 wrdWindowSize=args.word_window_size, artificialChar="ART_CHAR", startPadding="</s>",
                                 startPaddingWrd=args.start_symbol, endPaddingWrd=args.end_symbol,
                                 filters=getFilters([], log))
    ]
    outputGenerator = LabelGenerator(labelLexicon)

    if args.cv is not None:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle,
                                          numCVFolds=args.cv.numFolds)
        cvGenerators = trainIterator.getCVGenerators()
        iFold = 0
        numFolds = len(cvGenerators)
        for train, dev in cvGenerators:
            log.info({"cv": {"fold": iFold, "numFolds": numFolds}})
            trainNetwork(args, log, trainIterator=train, devIterator=dev, wordEmbedding=wordEmbedding,
                         charEmbedding=charEmbedding, borrow=False, labelLexicon=labelLexicon)
    else:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle)

        # Get dev inputs and (golden) outputs.
        devIterator = None
        if args.dev is not None:
            log.info("Reading development examples")
            devIterator = SyncBatchIterator(TokenLabelPerLineReader(args.dev, labelTknSep='\t'), inputGenerators,
                                            [outputGenerator], sys.maxint, shuffle=False)

        trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow=True,
                     labelLexicon=labelLexicon)

    # Testing.
    if args.test:
        log.info("Reading test dataset...")
        testIterator = SyncBatchIterator(TokenLabelPerLineReader(args.test, labelTknSep='\t'), inputGenerators,
                                         [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing...")
        wnnModel.test(testIterator)

    log.info("Done!")
예제 #4
0
def mainWnn(args):
    ################################################
    # Initializing parameters
    ##############################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load parameters of the saving model
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) != None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Insert the character that will be used to fill the matrix
    # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters of word is fixed in 20.
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Lendo Filtros do wnn
    log.info("Lendo filtros básicos")
    wordFilters = getFilters(args.word_filters, log)

    # Lendo Filtros do charwnn
    log.info("Lendo filtros do charwnn")
    charFilters = getFilters(args.char_filters, log)

    # Lendo Filtros do suffix
    log.info("Lendo filtros do sufixo")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Lendo Filtros da capitalização
    log.info("Lendo filtros da capitalização")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicon and go out after this
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")

            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")

            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)

            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                         startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                         filters=charFilters))

            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")

            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix is less than or equal to 0.")

            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")

            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated with success!")

        return

    ################################################
    # Starting training
    ###########################################

    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with Charwnn.")

    # Read word lexicon and create word embeddings
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")

        wordEmbedding = Embedding(wordLexicon, vectors)

    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read char lexicon and create char embeddings
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")

            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read suffix lexicon if suffix size is greater than 0
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")

                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read capitalization lexicon
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")

                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read labels
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Normalize the word embedding
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of model was normalized. This can change the result of test.")

    # Build neural network
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the convolution
            log.info("Using charwnn")
            convSize = args.conv_size

            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")
            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features
            concatenateInputs = [flatten]
            nmFetauresByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")

                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)

                nmFetauresByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")

                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(),
                                             name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)

                nmFetauresByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord
        else:
            # Use only the word embeddings
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the NN
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(),
                              name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())

    # Load the model
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output
        dev = args.dev

        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()

    log.info("Size of  word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))

    if withCharWNN:
        log.info("Size of  char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    if useSuffixFeatures:
        log.info("Size of  suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))

    if useCapFeatures:
        log.info("Size of  capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compiling
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]

    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]

    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None)
    # Training
    if trainReader:
        callback = []

        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)

            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)

            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with best acc in dev
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

            for x, labels in testReader:
                inputs = x

                predictions = wnnModel.prediction(inputs)

                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")