def __init__(self, lexicon_path, database, cache=None):
        super(DBR_CNN_CategoricalPreprocessor, self).__init__(database, cache)

        lexicons = json.load(open(lexicon_path))
        self.component_lexicon = Lexicon.fromList(lexicons['component'][1:],
                                                  False, 'component')
        self.priorityLexicon = Lexicon.fromList(
            sorted(lexicons['priority'][1:], reverse=True), False, 'priority')
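
        # Illustrative sketch (not from the original source): the lexicon JSON read
        # above is assumed to map field names to lists of values; the [1:] slice
        # skips the first entry of each list (the values below are hypothetical):
        #
        #   {"component": ["UNK", "UI", "Backend"],
        #    "priority": ["UNK", "P1", "P2", "P3"]}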
Example #2
    def fromWord2Vec(w2vFile, unknownSymbol, lexiconName=None):
        """
        Creates a lexicon and an embedding from a word2vec file.
        :param w2vFile: path of the file
        :param unknownSymbol: the string that represents unknown words.
        :param lexiconName: name given to the created lexicon (optional).
        :return: (data.Lexicon.Lexicon, Embedding)
        """
        log = logging.getLogger(__name__)
        fVec = codecs.open(w2vFile, 'r', 'utf-8')

        # Read the number of words in the dictionary and the embedding size
        nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
        embeddingSize = int(embeddingSizeStr)
        lexicon = Lexicon(unknownSymbol, lexiconName)

        # The empty list at index 0 is a placeholder for the unknown-word vector.
        # At the end, it is replaced either by a vector found in the w2vFile or by a random vector.
        vectors = [[]]
        nmEmptyWords = 0

        unknownInsered = False

        for line in fVec:
            splitLine = line.rstrip().split(u' ')
            word = splitLine[0]

            if len(word) == 0:
                log.warning("Insert in the embedding a empty string. This embeddings will be thrown out.")
                nmEmptyWords += 1
                continue

            vec = [float(num) for num in splitLine[1:]]


            if word == unknownSymbol:
                if len(vectors[0]) != 0:
                    raise Exception("A unknown symbol was already inserted.")

                vectors[0] = vec
                unknownInsered = True
            else:
                lexicon.put(word)
                vectors.append(vec)

        if len(vectors[0]) == 0:
            vectors[0] = FeatureVectorsGenerator().generateVector(embeddingSize)

        expected_size = lexicon.getLen() - 1 + nmEmptyWords

        if unknownInsered:
            expected_size += 1

        if int(nmWords) != expected_size:
            raise Exception("The size of lexicon is different of number of vectors")

        fVec.close()

        return lexicon, Embedding(lexicon, vectors)
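
    # Illustrative usage (not from the original source): a minimal sketch of the
    # text format fromWord2Vec expects -- a header line "<num_words> <dim>" followed
    # by one "<word> <v1> ... <vdim>" line per word. The file name, the unknown
    # symbol, and the assumption that this staticmethod lives on Embedding (as the
    # later call Embedding.fromWord2Vec(...) suggests) are hypothetical.
    #
    #   # toy.w2v contents:
    #   #   2 3
    #   #   cat 0.1 0.2 0.3
    #   #   dog 0.4 0.5 0.6
    #   lexicon, embedding = Embedding.fromWord2Vec("toy.w2v", u"__UNKNOWN__")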
def createCategoricalPreprocessorAndLexicons(lexiconPath, bugReportDatabase):
    # Define Filters and set preprocessing steps
    basicFilter = [
        data.preprocessing.TransformLowerCaseFilter(),
    ]

    lexiconJsons = json.load(open(lexiconPath))
    productLexicon = Lexicon.fromList(lexiconJsons['product'], True, 'product')
    severityLexicon = Lexicon.fromList(lexiconJsons['bug_severity'], True,
                                       'bug_severity')
    componentLexicon = Lexicon.fromList(lexiconJsons['component'], True,
                                        'component')
    priorityLexicon = Lexicon.fromList(lexiconJsons['priority'], True,
                                       'priority')

    categoricalArgs = [
        ('product', productLexicon, basicFilter),
        ('bug_severity', severityLexicon, basicFilter),
        ('component', componentLexicon, basicFilter),
        ('priority', priorityLexicon, basicFilter),
        # BasicFieldPreprocessor('version', versionLexicon, basicFilter + [TransformNumberToZeroFilter()]),
    ]

    str = "Field name and Lexicon size: "

    for f, l, _ in categoricalArgs:
        str += "{} {}; ".format(f, l.getLen())

    logging.getLogger().info(str)

    lexicons = [
        productLexicon, severityLexicon, componentLexicon, priorityLexicon
    ]

    return data.preprocessing.CategoricalPreprocessor(
        categoricalArgs, bugReportDatabase), lexicons
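
# Illustrative sketch (not from the original source): the lexicon JSON file read by
# createCategoricalPreprocessorAndLexicons is assumed to map each categorical field
# to a list of its possible values (the values below are hypothetical):
#
#   {
#       "product": ["Firefox", "Thunderbird"],
#       "bug_severity": ["minor", "major", "critical"],
#       "component": ["UI", "Backend"],
#       "priority": ["P1", "P2", "P3"]
#   }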
def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        emb = np.load(opts["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # todo: Allow use embeddings and other representation
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'], 'UUUKNNN', hasHeader=False,
                                                paddingSym=paddingSym)

    return lexicon, embedding
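
# Illustrative usage (not from the original source): the two configurations that
# load_embedding accepts; paths and the padding symbol are hypothetical.
#
#   # (a) pre-computed vectors (.npy) plus a text lexicon with one word per line:
#   lexicon, embedding = load_embedding(
#       {"lexicon": "words.txt", "word_embedding": "vectors.npy"}, "</s>")
#
#   # (b) a word2vec-style text file without a header (no separate lexicon given):
#   lexicon, embedding = load_embedding(
#       {"lexicon": None, "word_embedding": "vectors.w2v"}, "</s>")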
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] != None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    endSymbol = kwargs["end_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]
    tagLexicon = createLexiconUsingFile(kwargs["label_file"])
    # _lambda = theano.shared(kwargs["lambda"], "lambda")
    _lambda = theano.shared(0.0, "lambda")
    useAdagrad = kwargs["adagrad"]
    shuffle = kwargs["shuffle"]
    supHiddenLayerSize = kwargs["hidden_size_supervised_part"]
    unsupHiddenLayerSize = kwargs["hidden_size_unsupervised_part"]
    normalization = kwargs["normalization"]
    activationHiddenExtractor = kwargs["activation_hidden_extractor"]

    withCharWNN = kwargs["with_charwnn"]
    convSize = kwargs["conv_size"]
    charEmbeddingSize = kwargs["char_emb_size"]
    charWindowSize = kwargs["char_window_size"]
    startSymbolChar = "</s>"

    if kwargs["charwnn_with_act"]:
        charAct = tanh
    else:
        charAct = None

    # TODO: the maximum number of characters of word is fixed in 20.
    numMaxChar = 20

    if kwargs["decay"].lower() == "normal":
        decay = 0.0
    elif kwargs["decay"].lower() == "divide_epoch":
        decay = 1.0

    # Create the lexicon of domain labels
    domainLexicon = Lexicon()

    domainLexicon.put("0")
    domainLexicon.put("1")
    domainLexicon.stopAdd()
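
    # Domain label convention used below: "0" marks examples from the labeled
    # source domain and "1" marks examples from the unlabeled target domain
    # (see the ConstantLabel generators created later in this function).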

    log.info("Reading W2v File1")
    wordEmbedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"],
                                                     RandomUnknownStrategy())

    log.info("Reading training examples")
    # Generators
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordEmbedding, filters,
                            startSymbol)
    ]
    outputGeneratorTag = LabelGenerator(tagLexicon)

    if withCharWNN:
        # Create the character embedding
        charEmbedding = EmbeddingFactory().createRandomEmbedding(
            charEmbeddingSize)

        # Insert the padding of the character window
        charEmbedding.put(startSymbolChar)

        # Insert the character used to fill matrices whose dimension is smaller than
        # the chosen dimension. This enables the convolution to be performed as a matrix multiplication.
        artificialChar = "ART_CHAR"
        charEmbedding.put(artificialChar)

        inputGenerators.append(
            CharacterWindowGenerator(charEmbedding,
                                     numMaxChar,
                                     charWindowSize,
                                     wordWindowSize,
                                     artificialChar,
                                     startSymbolChar,
                                     startPaddingWrd=startSymbol,
                                     endPaddingWrd=endSymbol))

    unsupervisedLabelSource = ConstantLabel(domainLexicon, "0")

    # Reading supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(
        kwargs["train_source"], kwargs["token_label_separator"])
    trainSupervisedBatch = SyncBatchIterator(
        trainSupervisedDatasetReader,
        inputGenerators, [outputGeneratorTag, unsupervisedLabelSource],
        batchSize[0],
        shuffle=shuffle)

    # Get Unsupervised Input
    unsupervisedLabelTarget = ConstantLabel(domainLexicon, "1")

    trainUnsupervisedDatasetReader = TokenReader(kwargs["train_target"])
    trainUnsupervisedDatasetBatch = SyncBatchIterator(
        trainUnsupervisedDatasetReader,
        inputGenerators, [unsupervisedLabelTarget],
        batchSize[1],
        shuffle=shuffle)

    # Stopping to add new words, labels and chars
    wordEmbedding.stopAdd()
    tagLexicon.stopAdd()
    domainLexicon.stopAdd()

    if withCharWNN:
        charEmbedding.stopAdd()

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Size of  word dictionary and word embedding size: %d and %d" %
             (dictionarySize, embeddingSize))
    log.info(
        "Size of  char dictionary and char embedding size: %d and %d" %
        (charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    # Word Embedding Normalization
    if normalization == "zscore":
        wordEmbedding.zscoreNormalization()
    elif normalization == "minmax":
        wordEmbedding.minMaxNormalization()
    elif normalization == "mean":
        wordEmbedding.meanNormalization()
    elif normalization == "none" or not normalization:
        pass
    else:
        raise Exception("Unknown normalization method: %s" % normalization)

    # Source input
    wordWindowSource = T.lmatrix(name="windowSource")
    sourceInput = [wordWindowSource]

    # Create the layers related with the extractor of features
    embeddingLayerSrc = EmbeddingLayer(wordWindowSource,
                                       wordEmbedding.getEmbeddingMatrix(),
                                       trainable=True)
    flattenSrc = FlattenLayer(embeddingLayerSrc)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwnn
        charWindowIdxSrc = T.ltensor4(name="char_window_idx_source")
        sourceInput.append(charWindowIdxSrc)

        charEmbeddingConvLayerSrc = EmbeddingConvolutionalLayer(
            charWindowIdxSrc, charEmbedding.getEmbeddingMatrix(), numMaxChar,
            convSize, charWindowSize, charEmbeddingSize, charAct)
        layerBeforeLinearSrc = ConcatenateLayer(
            [flattenSrc, charEmbeddingConvLayerSrc])
        sizeLayerBeforeLinearSrc = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearSrc = flattenSrc
        sizeLayerBeforeLinearSrc = wordWindowSize * wordEmbedding.getEmbeddingSize()

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act1 = ActivationLayer(linear1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=SigmoidGenerator())
        act1 = ActivationLayer(linear1, sigmoid)
    else:
        raise Exception("Unknown extractor activation: %s" % activationHiddenExtractor)

    # Create the layers with the Tagger
    if supHiddenLayerSize == 0:
        layerBeforeSupSoftmax = act1
        layerSizeBeforeSupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the supervised softmax.")
    else:
        linear2 = LinearLayer(act1,
                              hiddenLayerSize,
                              supHiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act2 = ActivationLayer(linear2, tanh)

        layerBeforeSupSoftmax = act2
        layerSizeBeforeSupSoftmax = supHiddenLayerSize

        log.info("It inserted the layer before the supervised softmax.")

    supervisedLinear = LinearLayer(layerBeforeSupSoftmax,
                                   layerSizeBeforeSupSoftmax,
                                   tagLexicon.getLen(),
                                   weightInitialization=ZeroWeightGenerator())
    supervisedSoftmax = ActivationLayer(supervisedLinear, softmax)

    # Create the layers with the domain classifier
    gradientReversalSource = GradientReversalLayer(act1, _lambda)

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = gradientReversalSource
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the unsupervised softmax.")
    else:
        unsupervisedSourceLinearBf = LinearLayer(
            gradientReversalSource,
            hiddenLayerSize,
            unsupHiddenLayerSize,
            weightInitialization=GlorotUniform())
        actUnsupervisedSourceBf = ActivationLayer(unsupervisedSourceLinearBf,
                                                  tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedSourceBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    unsupervisedSourceLinear = LinearLayer(
        layerBeforeUnsupSoftmax,
        layerSizeBeforeUnsupSoftmax,
        domainLexicon.getLen(),
        weightInitialization=ZeroWeightGenerator())
    unsupervisedSourceSoftmax = ActivationLayer(unsupervisedSourceLinear,
                                                softmax)

    ## Target Part
    windowTarget = T.lmatrix(name="windowTarget")
    targetInput = [windowTarget]

    # Create the layers related with the extractor of features
    embeddingLayerUnsuper1 = EmbeddingLayer(
        windowTarget, embeddingLayerSrc.getParameters()[0], trainable=True)
    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwnn
        charWindowIdxTgt = T.ltensor4(name="char_window_idx_target")
        targetInput.append(charWindowIdxTgt)

        charEmbeddingConvLayerTgt = EmbeddingConvolutionalLayer(
            charWindowIdxTgt,
            charEmbeddingConvLayerSrc.getParameters()[0],
            numMaxChar,
            convSize,
            charWindowSize,
            charEmbeddingSize,
            charAct,
            trainable=True)
        layerBeforeLinearTgt = ConcatenateLayer(
            [flattenUnsuper1, charEmbeddingConvLayerTgt])
        sizeLayerBeforeLinearTgt = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearTgt = flattenUnsuper1
        sizeLayerBeforeLinearTgt = wordWindowSize * wordEmbedding.getEmbeddingSize()

    w, b = linear1.getParameters()
    linearUnsuper1 = LinearLayer(layerBeforeLinearTgt,
                                 sizeLayerBeforeLinearTgt,
                                 hiddenLayerSize,
                                 W=w,
                                 b=b,
                                 trainable=True)

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, sigmoid)
    else:
        raise Exception("Unknown extractor activation: %s" % activationHiddenExtractor)

    # Create the layers with the domain classifier
    gradientReversalTarget = GradientReversalLayer(actUnsupervised1, _lambda)

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = gradientReversalTarget
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("No extra layer was inserted before the unsupervised softmax.")
    else:
        w, b = unsupervisedSourceLinearBf.getParameters()
        unsupervisedTargetLinearBf = LinearLayer(gradientReversalTarget,
                                                 hiddenLayerSize,
                                                 unsupHiddenLayerSize,
                                                 W=w,
                                                 b=b,
                                                 trainable=True)
        actUnsupervisedTargetLinearBf = ActivationLayer(
            unsupervisedTargetLinearBf, tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedTargetLinearBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    w, b = unsupervisedSourceLinear.getParameters()
    unsupervisedTargetLinear = LinearLayer(layerBeforeUnsupSoftmax,
                                           layerSizeBeforeUnsupSoftmax,
                                           domainLexicon.getLen(),
                                           W=w,
                                           b=b,
                                           trainable=True)
    unsupervisedTargetSoftmax = ActivationLayer(unsupervisedTargetLinear,
                                                softmax)

    # Set loss and prediction and retrieve all layers
    supervisedLabel = T.lvector("supervisedLabel")
    unsupervisedLabelSource = T.lvector("unsupervisedLabelSource")
    unsupervisedLabelTarget = T.lvector("unsupervisedLabelTarget")

    supervisedOutput = supervisedSoftmax.getOutput()
    supervisedPrediction = ArgmaxPrediction(1).predict(supervisedOutput)
    supervisedLoss = NegativeLogLikelihood().calculateError(
        supervisedOutput, supervisedPrediction, supervisedLabel)

    unsupervisedOutputSource = unsupervisedSourceSoftmax.getOutput()
    unsupervisedPredSource = ArgmaxPrediction(1).predict(
        unsupervisedOutputSource)
    unsupervisedLossSource = NegativeLogLikelihood().calculateError(
        unsupervisedOutputSource, None, unsupervisedLabelSource)

    unsupervisedOutputTarget = unsupervisedTargetSoftmax.getOutput()
    unsupervisedPredTarget = ArgmaxPrediction(1).predict(
        unsupervisedOutputTarget)
    unsupervisedLossTarget = NegativeLogLikelihood().calculateError(
        unsupervisedOutputTarget, None, unsupervisedLabelTarget)

    # Creates model

    if useAdagrad:
        log.info("Using ADAGRAD")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    allLayersSource = supervisedSoftmax.getLayerSet(
    ) | unsupervisedSourceSoftmax.getLayerSet()
    allLayersTarget = unsupervisedTargetSoftmax.getLayerSet()
    unsupervisedLossTarget *= float(
        trainSupervisedBatch.size()) / trainUnsupervisedDatasetBatch.size()

    supervisedTrainMetrics = [
        LossMetric("TrainSupervisedLoss", supervisedLoss),
        AccuracyMetric("TrainSupervisedAcc", supervisedLabel,
                       supervisedPrediction),
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossSource),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelSource,
                       unsupervisedPredSource)
    ]
    unsupervisedTrainMetrics = [
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossTarget),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelTarget,
                       unsupervisedPredTarget)
    ]

    evalMetrics = [
        AccuracyMetric("EvalAcc", supervisedLabel, supervisedPrediction)
    ]

    testMetrics = [
        AccuracyMetric("TestAcc", supervisedLabel, supervisedPrediction)
    ]

    # TODO: I did not have time to test the code after the modifications
    model = GradientReversalModel(sourceInput,
                                  targetInput,
                                  supervisedLabel,
                                  unsupervisedLabelSource,
                                  unsupervisedLabelTarget,
                                  allLayersSource,
                                  allLayersTarget,
                                  opt,
                                  supervisedPrediction,
                                  supervisedLoss,
                                  unsupervisedLossSource,
                                  unsupervisedLossTarget,
                                  supervisedTrainMetrics,
                                  unsupervisedTrainMetrics,
                                  evalMetrics,
                                  testMetrics,
                                  mode=None)

    # Get dev inputs and output
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"],
                                        kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader,
                                  inputGenerators, [outputGeneratorTag],
                                  sys.maxint,
                                  shuffle=False)

    callbacks = []
    # log.info("Usando lambda fixo: " + str(_lambda.get_value()))
    log.info("Usando lambda variado. alpha=" + str(kwargs["alpha"]) +
             " height=" + str(kwargs["height"]))
    callbacks.append(
        ChangeLambda(_lambda, kwargs["alpha"], numEpochs, kwargs["height"]))

    if kwargs["additional_dev"]:
        callbacks.append(
            AdditionalDevDataset(model, kwargs["additional_dev"],
                                 kwargs["token_label_separator"],
                                 inputGenerators, outputGeneratorTag))

    # Training Model
    model.train([trainSupervisedBatch, trainUnsupervisedDatasetBatch],
                numEpochs,
                devReader,
                callbacks=callbacks)
Example #6
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)
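
    # Illustrative sketch (not from the original source): a minimal JSON config for
    # this script, restricted to keys that are read below; values and paths are
    # hypothetical and PARAMETERS may define further (or required) entries.
    #
    #   {
    #       "train": "data/train.txt",
    #       "dev": "data/dev.txt",
    #       "load_method": "sync",
    #       "word_embedding": "embeddings/vectors.w2v",
    #       "labels": ["category_a", "category_b"],
    #       "filters": ["data.preprocessing.TransformLowerCaseFilter"],
    #       "word_window_size": 5,
    #       "hidden_size": 300,
    #       "conv_size": 50,
    #       "alg": "sgd",
    #       "lr": 0.01,
    #       "decay": "none",
    #       "num_epochs": 10,
    #       "start_symbol": "</s>",
    #       "end_symbol": "</s>"
    #   }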

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide either word_embedding or both word_lexicon and word_emb_size.")
        exit(1)

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value in this matrix is an index that represents a token
    # in the embedding.
    inWords = tensor.lmatrix("inWords")
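
    # Illustrative sketch (not from the original source): with word_window_size = 3
    # and hypothetical tokens, inWords could look like
    #   [[idx("<s>"), idx("red"),  idx("shoe")],
    #    [idx("red"), idx("shoe"), idx("</s>")]]
    # where idx(.) is the lexicon index of a token.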

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize, W=convW, b=convb,
                             weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1, b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    softmaxLinearInput = LinearLayer(hiddenAct,
                                     hiddenLayerSize,
                                     labelLexicon.getLen(),
                                     W=W2, b=b2,
                                     weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(softmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(softmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
        sys.exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              -1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               -1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

        # Saving the model after training
        if args.save_wordEmbedding:
            embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
            log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding))
        if args.save_conv:
            convLinear.save(args.save_conv)
            log.info("Saved convolution layer to file: %s" % (args.save_conv))
        if args.save_hiddenLayer:
            hiddenLinear.save(args.save_hiddenLayer)
            log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer))
        if args.save_softmax:
            softmaxLinearInput.save(args.save_softmax)
            log.info("Saved softmax layer to file: %s" % (args.save_softmax))

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
Example #7
def main(_run, _config, _seed, _log):
    """

    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)
    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameter to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName])
                             for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors,
            None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # todo: Allow use embeddings and other representation
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'],
            'UUUKNNN',
            hasHeader=False,
            paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]
                     ) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")
        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        arguments = (databasePath, compareAggOpt['word_embedding'],
                     str(compareAggOpt['lexicon']), ' '.join(
                         sorted([
                             fil.__class__.__name__ for fil in extractorFilters
                         ])), compareAggOpt['tokenizer'], str(bow), str(freq),
                     SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, extractorCache)
        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers,
                                     torch.float32,
                                     unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)
    else:
        raise ArgumentError(
            "Loss value %s is invalid. You should choose one of these: bce and triplet"
            % args['loss'])

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)
    """
    Loading the training and validation. Also, it sets how the negative example will be generated.
    """
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator', )
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    trainingDataset.duplicateIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase,
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['k'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")
                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info(
                    "Misc: non-zero and Positive Pre-selected list generator")
                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, product_component, random_k, pre, positive_pre, misc_non_zero_pre and misc_non_zero_positive_pre"
                    % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader,
                                    batch_size=batchSize,
                                    collate_fn=cmp_collate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(
                args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(
                args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader,
                                      batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None
    """
    Training and evaluate the model. 
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])
    else:
        raise ArgumentError(
            "Optimizer %s is invalid. You should choose one of these: sgd and adam"
            % optimizer_opt)

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device,
        PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt,
                                                    args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        engine.kk = 0
        model.train()

        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, (BCELoss)):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)
    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)
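
    # Note (not from the original source): thresholded_output_transform is defined
    # elsewhere in the original file. A typical definition for sigmoid/BCE outputs,
    # shown here only as an assumption, rounds the scores to {0, 1}:
    #
    #   def thresholded_output_transform(output):
    #       y_pred, y = output
    #       return torch.round(y_pred), y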

    # Set validation functions
    def validationIteration(engine, batch):
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()

        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(lossFn,
                    output_transform=lambda x: (x[0], x[0][0])
                    if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = (epoch == args['epochs'])

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRankingTrain,
                             rankingScorer,
                             bugReportDatabase,
                             None,
                             epoch,
                             "train",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             epoch,
                             "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             0,
                             "validation",
                             recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh has not been implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy':
            ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision':
            ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall':
            ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions':
            PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)
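                # The 1e-15 term avoids division by zero when precision + recall is 0.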

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'accuracy':
                    np.float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run,
                         logger,
                         rankingClass,
                         rankingScorer,
                         bugReportDatabase,
                         recallRateOpt["result_file"],
                         0,
                         None,
                         group_by_master,
                         recommendationListfn=recommendation_fn)
예제 #8
0
# coding=utf-8
import sys

from __builtin__ import open

labelsFilename = "/home/eraldo/lia/ner/labels.txt"  #sys.argv[0]
corpusFilename = "/home/eraldo/lia/ner/first.txt_dev.txt"  #sys.argv[1]
predictionFilename = "/home/eraldo/lia/ner/epoch69.txt"  #sys.argv[2]

from data.Lexicon import Lexicon

labels = Lexicon.fromTextFile(labelsFilename, hasUnknowSymbol=False)

corpusFile = open(corpusFilename)

predictionFile = open(predictionFilename)
prediction = [
    int(s) for s in predictionFile.readline().strip()[:-1].split(',')
]
predictionFile.close()

i = 0
for line in corpusFile:
    ftrs = line.strip().split('\t')
    if len(ftrs[1]) > 1:
        ftrs[1] = 'I-' + ftrs[1][:3]
    pred = labels.getLexicon(prediction[i])
    if len(pred) > 1:
        pred = 'I-' + pred[:3]
    print "{0} POS {1} {2}".format(ftrs[0], ftrs[1], pred)
    i += 1
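# Each printed line follows a CoNLL-like "token POS gold predicted" format,
# e.g. (hypothetical): "Maria POS I-PER I-PER".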
예제 #9
0
def mainWnnNer(args):
    # Initializing parameters.
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    log.info({"type": "args", "args": args})

    # GPU configuration.
    log.info({"floatX": str(theano.config.floatX), "device": str(theano.config.device)})

    # Parameters.
    # lr = args.lr
    # startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    # numEpochs = args.num_epochs
    # shuffle = args.shuffle
    # normalization = args.normalization
    # wordWindowSize = args.word_window_size
    # hiddenLayerSize = args.hidden_size
    # hiddenActFunctionName = args.hidden_activation_function
    # embeddingSize = args.word_emb_size
    # batchSize = args.batch_size
    # structGrad = args.struct_grad
    # charStructGrad = args.char_struct_grad
    #
    # charEmbeddingSize = args.char_emb_size
    # charWindowSize = args.char_window_size
    # charConvSize = args.conv_size

    # Word filters.
    log.info("Loading word filters...")
    wordFilters = getFilters(args.word_filters, log)

    # Loading/creating word lexicon and word embedding.
    if args.word_embedding is not None:
        log.info("Loading word embedding...")
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon is not None:
        log.info("Loading word lexicon...")
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=args.word_emb_size)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        sys.exit(1)

    # Loading char lexicon.
    log.info("Loading char lexicon...")
    charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")

    # Character embedding.
    charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=args.char_emb_size)

    # Loading label lexicon.
    log.info("Loading label lexicon...")
    labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")

    # Normalize the word embedding
    if args.normalization is not None:
        normFactor = 1
        if args.norm_factor is not None:
            normFactor = args.norm_factor

        if args.normalization == "minmax":
            log.info("Normalizing word embedding: minmax")
            wordEmbedding.minMaxNormalization(norm_coef=normFactor)
        elif args.normalization == "mean":
            log.info("Normalizing word embedding: mean")
            wordEmbedding.meanNormalization(norm_coef=normFactor)
        else:
            log.error("Unknown normalization method: %s" % args.normalization)
            sys.exit(1)
    elif args.norm_factor is not None:
        log.error("Parameter norm_factor cannot be present without normalization.")
        sys.exit(1)

    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word lexicon is %d and word embedding size is %d" % (dictionarySize, args.word_emb_size))

    # Setup the input and (golden) output generators (readers).
    inputGenerators = [
        WordWindowGenerator(args.word_window_size, wordLexicon, wordFilters, args.start_symbol, args.end_symbol),
        CharacterWindowGenerator(lexicon=charLexicon, numMaxChar=20, charWindowSize=args.char_window_size,
                                 wrdWindowSize=args.word_window_size, artificialChar="ART_CHAR", startPadding="</s>",
                                 startPaddingWrd=args.start_symbol, endPaddingWrd=args.end_symbol,
                                 filters=getFilters([], log))
    ]
    outputGenerator = LabelGenerator(labelLexicon)

    if args.cv is not None:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle,
                                          numCVFolds=args.cv.numFolds)
        cvGenerators = trainIterator.getCVGenerators()
        iFold = 0
        numFolds = len(cvGenerators)
        for train, dev in cvGenerators:
            log.info({"cv": {"fold": iFold, "numFolds": numFolds}})
            trainNetwork(args, log, trainIterator=train, devIterator=dev, wordEmbedding=wordEmbedding,
                         charEmbedding=charEmbedding, borrow=False, labelLexicon=labelLexicon)
    else:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle)

        # Get dev inputs and (golden) outputs.
        devIterator = None
        if args.dev is not None:
            log.info("Reading development examples")
            devIterator = SyncBatchIterator(TokenLabelPerLineReader(args.dev, labelTknSep='\t'), inputGenerators,
                                            [outputGenerator], sys.maxint, shuffle=False)

        trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow=True,
                     labelLexicon=labelLexicon)

    # Testing.
    if args.test:
        log.info("Reading test dataset...")
        testIterator = SyncBatchIterator(TokenLabelPerLineReader(args.test, labelTknSep='\t'), inputGenerators,
                                         [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing...")
        wnnModel.test(testIterator)

    log.info("Done!")
예제 #10
0
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 3:
        log.error("Missing argument: <JSON config file> or/and <Input file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon,
                                           hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon,
                                  embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error(
            "You must provide argument word_embedding or word_lexicon and word_emb_size"
        )
        sys.exit(1)

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error(
                "Only one of the parameters label_lexicon and labels can be provided!"
            )
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon,
                                            hasUnknowSymbol=False)
    else:
        log.error(
            "One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents one token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value in this matrix is an index that represents a token in
    # the embedding.
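    # For example, with a window size of 3, a 3-token offer "w1 w2 w3" would be
    # encoded roughly as (padding symbols and indices are hypothetical):
    #   [[idx(<s>), idx(w1), idx(w2)],
    #    [idx(w1),  idx(w2), idx(w3)],
    #    [idx(w2),  idx(w3), idx(</s>)]]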
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" %
                 (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=convW,
                             b=convb,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" %
                 (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" %
                 (str(W2.shape), args.load_softmax))

    softmaxLinearInput = LinearLayer(hiddenAct,
                                     hiddenLayerSize,
                                     labelLexicon.getLen(),
                                     W=W2,
                                     b=b2,
                                     weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(softmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(softmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(
            args.label_weights) != labelLexicon.getLen():
        log.error(
            "Number of label weights (%d) is different from number of labels (%d)!"
            % (len(args.label_weights), labelLexicon.getLen()))
        exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction,
                                outLabel)

    # Input generators: word window.
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol,
                            endSymbol)
    ]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters,
                                     startSymbol, endSymbol)

    # GETS HIDDEN LAYER:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # GRAPH FOR PREDICTION LAYER
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))
    # print tweets
    # graph.getResultsFor(t) returns the prediction for a given tweet t
    try:
        output_file = open("Output.txt", "w")
    except:
        print "Failed to create the output file\n"
    try:
        for t in tweets:
            output_file.write(
                t.replace('\n', '').replace('\t', '') + "\t " +
                lblTxt[graph.getResultsFor(t)] + "\n")
        print "Results generated successfully!\n"
    except:
        print "Error while generating the results\n"
예제 #11
0
def main(_run, _config, _seed, _log):
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    device = torch.device('cuda' if args['cuda'] else "cpu")
    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Setting the parameter to save and loading parameters
    important_parameters = ['dbr_cnn']
    parameters_to_save = dict([(name, args[name])
                               for name in important_parameters])

    if args['load'] is not None:
        map_location = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        model_info = torch.load(args['load'], map_location=map_location)
        model_state = model_info['model']

        for param_name, param_value in model_info['params'].items():
            args[param_name] = param_value
    else:
        model_state = None

    # Set basic variables
    preprocessors = PreprocessorList()
    input_handlers = []
    report_database = BugReportDatabase.fromJson(args['bug_database'])
    batchSize = args['batch_size']
    dbr_cnn_opt = args['dbr_cnn']

    # Loading word embedding and lexicon
    emb = np.load(dbr_cnn_opt["word_embedding"])
    padding_sym = "</s>"

    lexicon = Lexicon(unknownSymbol=None)
    with codecs.open(dbr_cnn_opt["lexicon"]) as f:
        for l in f:
            lexicon.put(l.strip())

    lexicon.setUnknown("UUUKNNN")
    padding_id = lexicon.getLexiconIndex(padding_sym)
    embedding = Embedding(lexicon, emb, paddingIdx=padding_id)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    # Load filters and tokenizer
    filters = loadFilters(dbr_cnn_opt['filters'])

    if dbr_cnn_opt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif dbr_cnn_opt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % dbr_cnn_opt['tokenizer'])

    # Add preprocessors
    preprocessors.append(
        DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'],
                                        report_database))
    preprocessors.append(
        SummaryDescriptionPreprocessor(lexicon, report_database, filters,
                                       tokenizer, padding_id))

    # Add input_handlers
    input_handlers.append(DBRDCNN_CategoricalInputHandler())
    input_handlers.append(
        TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"])))

    # Create Model
    model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"],
                    dbr_cnn_opt['update_embedding'])

    model.to(device)

    if model_state:
        model.load_state_dict(model_state)

    # Set loss function
    logger.info("Using BCE Loss")
    loss_fn = BCELoss()
    loss_no_reduction = BCELoss(reduction='none')
    cmp_collate = PairBugCollate(input_handlers,
                                 torch.float32,
                                 unsqueeze_target=True)

    # Loading the training and setting how the negative example will be generated.
    if args.get('pairs_training'):
        negative_pair_gen_opt = args.get('neg_pair_generator')
        pairsTrainingFile = args.get('pairs_training')

        offlineGeneration = not (negative_pair_gen_opt is None
                                 or negative_pair_gen_opt['type'] == 'none')
        # Only read 'random_anchor' when a negative pair generator is configured.
        random_anchor = negative_pair_gen_opt[
            'random_anchor'] if offlineGeneration else None

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                randomInvertPair=args['random_switch'])
        else:
            pair_gen_type = negative_pair_gen_opt['type']
            master_id_by_bug_id = report_database.getMasterIdByBugId()

            if pair_gen_type == 'random':
                logger.info("Random Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = RandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id)

            elif pair_gen_type == 'non_negative':
                logger.info("Non Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negative_pair_gen_opt['rate'],
                    bug_ids,
                    master_id_by_bug_id,
                    negative_pair_gen_opt['n_tries'],
                    device,
                    randomAnchor=random_anchor)
            elif pair_gen_type == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, training_dataset.duplicateIds,
                    master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'],
                    negative_pair_gen_opt['random_anchor'])
            elif pair_gen_type == 'random_k':
                logger.info("Random K Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = KRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id, negative_pair_gen_opt['k'],
                    device)
            elif pair_gen_type == "pre":
                logger.info("Pre-selected list generator")
                negative_pair_generator = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])
            elif pair_gen_type == "misc_non_zero_pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])

                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'])

                negative_pair_generator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, random_k, pre and misc_non_zero_pre"
                    % pair_gen_type)

            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                negative_pair_generator,
                randomInvertPair=args['random_switch'])

        training_loader = DataLoader(pair_training_reader,
                                     batch_size=batchSize,
                                     collate_fn=cmp_collate.collate,
                                     shuffle=True)
        logger.info("Training size: %s" % (len(training_loader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pair_validation_reader = PairBugDatasetReader(
            args.get('pairs_validation'), preprocessors)
        validation_loader = DataLoader(pair_validation_reader,
                                       batch_size=batchSize,
                                       collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validation_loader.dataset)))
    else:
        validation_loader = None
    """
    Training and evaluate the model. 
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'],
                              momentum=args['momentum'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])

    # Recall rate
    ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1],
                                    input_handlers[0], input_handlers[1],
                                    model, device, args['ranking_batch_size'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt)

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselect_list_ranking = PreselectListRanking(recallEstimationOpt)

    lr_scheduler_opt = args.get('lr_scheduler', None)

    if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant':
        logger.info("Scheduler: Constant")
        lr_sched = None
    elif lr_scheduler_opt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lr_scheduler_opt["step_size"], args["decay"]))
        lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"],
                          lr_scheduler_opt["decay"])
    elif lr_scheduler_opt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lr_scheduler_opt["decay"]))
        lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"])
    elif lr_scheduler_opt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lr_scheduler_opt["decay"]))

        lrDecay = lr_scheduler_opt["decay"]
        lr_sched = LambdaLR(optimizer, lambda epoch: 1 /
                            (1.0 + epoch * lrDecay))
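        # LambdaLR multiplies the base LR by the lambda value each epoch, e.g.
        # with decay=0.1 the factors are 1.0, 1/1.1, 1/1.2, ... for epochs 0, 1, 2, ...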
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: constant, step, exp and linear"
            % lr_scheduler_opt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        model.train()

        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    trainer = Engine(trainingIteration)
    negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0

    trainingMetrics = {
        'training_loss':
        AverageLoss(loss_fn),
        'training_acc':
        AccuracyWrapper(output_transform=thresholded_output_transform),
        'training_precision':
        PrecisionWrapper(output_transform=thresholded_output_transform),
        'training_recall':
        RecallWrapper(output_transform=thresholded_output_transform),
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        model.eval()

        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(loss_fn),
        'validation_acc':
        AccuracyWrapper(output_transform=thresholded_output_transform),
        'validation_precision':
        PrecisionWrapper(output_transform=thresholded_output_transform),
        'validation_recall':
        RecallWrapper(output_transform=thresholded_output_transform),
    }
    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lr_sched:
            lr_sched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validation_loader:
            evaluator.run(validation_loader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain,
                             ranking_scorer, report_database, None, epoch,
                             "train")
            ranking_scorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselect_list_ranking,
                             ranking_scorer, report_database,
                             args.get("ranking_result_file"), epoch,
                             "validation")
            ranking_scorer.free()

        if not lastEpoch:
            pair_training_reader.sampleNewNegExamples(model, loss_no_reduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parameters_to_save
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(training_loader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validation_loader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselect_list_ranking,
                             ranking_scorer, report_database,
                             args.get("ranking_result_file"), 0, "validation")

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh has not been implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy':
            ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision':
            ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall':
            ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions':
            PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'accuracy':
                    np.float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recall_rate_opt = args.get('recall_rate', {'type': 'none'})
    if recall_rate_opt['type'] != 'none':
        if recall_rate_opt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])

            ranking_class = SunRanking(report_database, recall_rate_dataset,
                                       recall_rate_opt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recall_rate_opt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])
            ranking_class = DeshmukhRanking(report_database,
                                            recall_rate_dataset)
            group_by_master = recall_rate_opt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh"
                % recall_rate_opt['type'])

        logRankingResult(
            _run,
            logger,
            ranking_class,
            ranking_scorer,
            report_database,
            recall_rate_opt["result_file"],
            0,
            None,
            group_by_master,
        )
예제 #12
0
def main(args):
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.wv_normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None
    hiddenActFunction = tanh

    if args.word_embedding:
        log.info("Reading W2v File")
        (lexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol='unknown')
        lexicon.stopAdd()
    else:
        wordEmbedding = EmbeddingFactory().createRandomEmbedding(
            args.word_emb_size)

    # Get the inputs and output
    if args.labels:
        labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False)
    else:
        labelLexicon = Lexicon()

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents one token of the offer. Each token is
    # represented by a window of tokens (the central token and some neighboring
    # tokens). Each value in this matrix is an index that represents a token in
    # the embedding.
    inWords = T.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = T.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = SigmoidGlorot(
    ) if hiddenActFunction == sigmoid else GlorotUniform()

    # Convolution layer. Convolution over the text of a document.
    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=None,
                             b=None,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Generate word windows.
    wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon,
                                                     filters, startSymbol,
                                                     endSymbol)

    # List of input generators.
    inputGenerators = [wordWindowFeatureGenerator]

    # Hidden layer.
    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)

    # Linear input of the softmax layer.
    softmaxLinearInput = LinearLayer(hiddenAct,
                                     hiddenLayerSize,
                                     labelLexicon.getLen(),
                                     W=W2,
                                     b=b2,
                                     weightInitialization=ZeroWeightGenerator())
    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(softmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(softmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    loss = NegativeLogLikelihoodOneExample().calculateError(
        softmaxAct.getOutput()[0], prediction, outLabel)

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]

    if args.train:
        trainDatasetReader = DocReader(args.train)

        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(trainDatasetReader,
                                          inputGenerators,
                                          outputGenerators,
                                          -1,
                                          shuffle=shuffle)
        lexicon.stopAdd()
        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error(
                "Argument eval_per_iteration cannot be used without a dev argument."
            )
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = DocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    if args.decay == "linear":
        decay = 1.0
    elif args.decay == "none":
        decay = 0.0
    else:
        log.error("Unknown decay strategy %s. Expected: none or linear." %
                  args.decay)
        sys.exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error(
            "Unknown algorithm: %s. Expected values are: adagrad or sgd." %
            args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction)
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction)
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator,
                    numEpochs,
                    devIterator,
                    evalPerIteration=evalPerIteration)

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = DocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
예제 #13
0
def mainWnnNegativeSampling(args):
    # Reading parameters
    embeddingMatrix = None
    wordEmbeddingSize = args.word_embedding_size
    windowSize = args.window_size
    hiddenLayerSize = args.hidden_size
    startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    endSymbol = startSymbol
    noiseRate = args.noise_rate

    # TODO: the algorithm does not support mini-batches. Only stochastic training is supported.
    batchSize = 1

    shuffle = args.shuffle
    lr = args.lr
    numEpochs = args.num_epochs
    power = args.power

    minLr = args.min_lr
    numExUpdLr = args.num_examples_updt_lr

    log = logging.getLogger(__name__)

    log.info(str(args))

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
    #
    # if args.decay.lower() == "normal":
    #     decay = 0.0
    # elif args.decay.lower() == "divide_epoch":
    #     decay = 1.0

    parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"}

    # Calculate the frequency of each word
    trainReader = TokenReader(args.train)
    wordLexicon = Lexicon("UUKNNN", "lexicon")
    wordLexicon.put(startSymbol, False)

    totalNumOfTokens = 0
    for tokens, labels in trainReader.read():
        # We don't count the </s> token, because it is only inserted into the sentence to count its frequency.
        totalNumOfTokens += len(tokens)

        # word2vec treats the number of lines (sentences) as the frequency of </s>
        tokens += [startSymbol]

        for token in tokens:
            wordLexicon.put(token)
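    # Illustrative example (made-up input, not from the original code): for a corpus
    # with the two lines "the cat sat" and "the dog ran", totalNumOfTokens would be 6
    # and the start symbol would have its frequency incremented once per line
    # (twice in total), mirroring word2vec's handling of </s>.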

    # Prune words whose frequency is less than min_count
    wordLexicon.prune(args.min_count)
    wordLexicon.stopAdd()

    # Calculate the unigram distribution
    frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power)
    total = float(frequency.sum())

    # # Print the distribution of all words
    # for _ in xrange(len(frequency)):
    #     print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_],frequency[_]/float(total))

    sampler = Sampler(frequency / float(total))
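    # A minimal sketch (illustrative numbers only) of the smoothed unigram distribution
    # used for noise sampling: raising the raw counts to `power` (word2vec typically
    # uses 0.75) flattens the distribution, so rare words are drawn more often as
    # negative examples. For counts [100, 10, 1] and power 0.75:
    #   freq  = [100**0.75, 10**0.75, 1**0.75] ~= [31.6, 5.6, 1.0]
    #   probs = freq / freq.sum()              ~= [0.83, 0.15, 0.03]
    # versus roughly [0.90, 0.09, 0.01] without smoothing.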

    # Create a random embedding for each word
    wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize)
    log.info("Lexicon size: %d" % (wordLexicon.getLen()))

    # Create NN
    x = T.lmatrix("word_window")
    y = T.lvector("labels")

    wordEmbeddingLayer = EmbeddingLayer(x,
                                        wordEmbedding.getEmbeddingMatrix(),
                                        name="embedding")
    flatten = FlattenLayer(wordEmbeddingLayer)

    linear1 = LinearLayer(flatten,
                          wordEmbeddingSize * windowSize,
                          hiddenLayerSize,
                          name="linear1")
    act1 = ActivationLayer(linear1, tanh)

    # Logistic regression: a single sigmoid output unit.
    linear2 = LinearLayer(act1,
                          hiddenLayerSize,
                          1,
                          weightInitialization=ZeroWeightGenerator(),
                          name="linear_softmax_regresion")

    act2 = ActivationLayer(linear2, sigmoid)
    # We clip the sigmoid output because it can reach 0 or 1, and log(0) is infinite, which would break the cross-entropy.
    output = T.flatten(T.clip(act2.getOutput(), 10**-5, 1 - 10**-5))

    # Loss Functions
    negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum()
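    # For a single example this is the usual binary cross-entropy,
    #   -(y * log(p) + (1 - y) * log(1 - p)),
    # where p is the clipped sigmoid output and y is 1 for a real (context, word) pair
    # (noise pairs with y = 0 are presumably generated inside NegativeSamplingModel,
    # which is not shown here). The clipping above keeps p inside [1e-5, 1 - 1e-5],
    # so neither log term can become infinite.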
    # Set training
    inputGenerators = [
        WordWindowGenerator(windowSize, wordLexicon, [], startSymbol,
                            endSymbol)
    ]
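    # Illustrative example (assuming the generator pads sentence boundaries with the
    # start/end symbols; the generator itself is defined elsewhere): with windowSize = 5
    # and the sentence "the cat sat", the window centered on "cat" would be
    # [startSymbol, "the", "cat", "sat", endSymbol], encoded as lexicon indices.
    # Note that in this script endSymbol is set equal to startSymbol.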

    outputGenerators = [ConstantLabel(labelLexicon=None, label=1)]

    trainIterator = SyncBatchIterator(trainReader, inputGenerators,
                                      outputGenerators, batchSize, shuffle)

    trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)]

    allLayers = act2.getLayerSet()

    # opt = SGD(lr=lr, decay=decay)
    opt = SGD(lr=lr)

    model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr,
                                  numExUpdLr, totalNumOfTokens, numEpochs, [x],
                                  [y], allLayers, opt, negativeSamplingLoss,
                                  trainMetrics)
    # Prepare the model writer (the model itself is saved after training)
    if args.save_model:
        savePath = args.save_model
        objsToSave = list(act2.getLayerSet()) + [wordLexicon]

        modelWriter = ModelWriter(savePath, objsToSave, args,
                                  parametersToSaveOrLoad)

    # Training
    model.train(trainIterator, numEpochs=numEpochs, callbacks=[])

    if args.save_model:
        modelWriter.save()
예제 #14
0
def mainWnn(args):
    ################################################
    # Initializing parameters
    ##############################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load the parameters of the saved model
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) is not None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Character used to pad the character matrix of words shorter than the chosen dimension.
    # This padding allows the convolution to be computed as a single matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters per word is fixed at 20.
    numMaxChar = 20
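    # Illustrative example (an assumption about how the padding is applied, not taken
    # from the original code): with numMaxChar = 20, a word such as "cat" would be
    # represented by its 3 characters followed by 17 copies of ART_CHAR, so every word
    # yields a fixed-size character matrix and the convolution over all words can be
    # computed as one batched matrix multiplication.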

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Reading wnn filters
    log.info("Reading basic filters")
    wordFilters = getFilters(args.word_filters, log)

    # Reading charwnn filters
    log.info("Reading charwnn filters")
    charFilters = getFilters(args.char_filters, log)

    # Reading suffix filters
    log.info("Reading suffix filters")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Reading capitalization filters
    log.info("Reading capitalization filters")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicons and exit afterwards
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")

            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")

            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)

            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                         startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                         filters=charFilters))

            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")

            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because suffix_size is less than or equal to 0.")

            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")

            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated with success!")

        return

    ################################################
    # Starting training
    ################################################

    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with Charwnn.")

    # Read word lexicon and create word embeddings
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")

        wordEmbedding = Embedding(wordLexicon, vectors)

    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read char lexicon and create char embeddings
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")

            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read suffix lexicon if suffix size is greater than 0
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")

                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read capitalization lexicon
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")

                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read labels
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Normalize the word embedding
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of model was normalized. This can change the result of test.")

    # Build neural network
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the convolution
            log.info("Using charwnn")
            convSize = args.conv_size

            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")
            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
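            # Illustrative arithmetic (made-up sizes): with wordWindowSize = 5, a word
            # embedding of size 100 and convSize = 50, each word contributes
            # 100 + 50 = 150 features, so the concatenated layer has 5 * 150 = 750 units.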
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features
            concatenateInputs = [flatten]
            nmFetauresByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")

                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)

                nmFetauresByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")

                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(),
                                             name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)

                nmFetauresByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord
        else:
            # Use only the word embeddings
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the NN
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(),
                              name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())
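        # prediction holds, for each example in the batch, the index of the
        # highest-scoring label, i.e. the argmax over axis 1 of the softmax output.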

    # Load the model
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output
        dev = args.dev

        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0
    else:
        log.error("Unknown decay option: %s. Expected values are: normal or divide_epoch." % args.decay)
        sys.exit(1)
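    # Note (an assumption about this SGD/Adagrad implementation, not verified here):
    # decay = 0.0 presumably keeps the learning rate constant, while decay = 1.0
    # presumably shrinks it as training progresses, e.g. something like
    # lr_t = lr / (1 + decay * epoch).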

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()

    log.info("Size of  word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))

    if withCharWNN:
        log.info("Size of  char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    if useSuffixFeatures:
        log.info("Size of  suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))

    if useCapFeatures:
        log.info("Size of  capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compiling
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)
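    # For reference, the negative log-likelihood over a batch is
    #   L = -(1/N) * sum_i log p(y_i | x_i),
    # where p(y_i | x_i) is the softmax probability the network assigns to the correct
    # label y_i (whether the mean or the sum is used depends on the
    # NegativeLogLikelihood implementation, which is not shown here).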

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]

    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]

    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None)
    # Training
    if trainReader:
        callback = []

        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)

            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)

            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with the best accuracy on the dev set
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

            for x, labels in testReader:
                inputs = x

                predictions = wnnModel.prediction(inputs)

                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")

            f.close()