Example #1
    def loadSentenceStructureNormals(self):
        if(self._nloTextData is not None):
            ConsoleOutput.printGreen("Beginning sentence structure parse...")

            SentenceSize = self._nloTextData.sentenceSize
            # Break the file into learning sequences with defined targets
            for index in range(0, SentenceSize):
                trainSequence = []
                target = None
                if(index == SentenceSize - (self._TrainRangeSS)):
                    break
                for i in range(0, self._TrainRangeSS+1):
                    # At the end of the sequence, so must be the target
                    if(i == self._TrainRangeSS):
                        target = self._nloTextData.sentenceNormalised[index + i]
                        break
                    trainSequence.append(self._nloTextData.sentenceNormalised[index + i])
                # Make sure we don't feed incorrectly sized vectors into the network
                if(len(trainSequence) != self._TrainRangeSS):
                    raise ValueError('Train sequence vector not equal to _TrainRangeSS: ' + str(trainSequence))
                self._TrainingSequenceSS.append(trainSequence)
                self._TrainingTargetsSS.append(target)
        else:
            raise ValueError('Need to load data via loadFromTextFile() before calling function.')

        ConsoleOutput.printGreen("Data normalised successful...")
        return True
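For reference, here is a minimal standalone sketch of the same sliding-window split, assuming a plain list of normalised values and an illustrative window size of 4; the names below are illustrative, not from the project:

# Sketch of the sliding-window split used above (illustrative names)
def build_sequences(normals, train_range=4):
    sequences, targets = [], []
    # stop once there is no full window plus a target left
    for index in range(len(normals) - train_range):
        sequences.append(normals[index:index + train_range])
        targets.append(normals[index + train_range])
    return sequences, targets

seqs, targs = build_sequences([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
print(seqs)    # [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]
print(targs)   # [0.5, 0.6]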
Example #2
    def loadTextFromFile(self, InputFile):
        ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
        # Convert to natural language object
        sentence = []
        # use a context manager so the file is closed after reading
        with open(InputFile, 'r', encoding='UTF-8') as f:
            for line in f:
                line = self.thu.cut(line.strip(), text=True)
                sentence.extend(line.split())
        ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
        self._nloTextData = NaturalLanguageObject(sentence)
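The self.thu object used here is presumably a THULAC segmenter; a minimal sketch of how it might be constructed, assuming the standard thulac package API:

import thulac

# seg_only=True skips part-of-speech tagging; text=True makes cut()
# return a space-separated string instead of a list of pairs
thu = thulac.thulac(seg_only=True)
print(thu.cut("今天天气不错", text=True))  # e.g. "今天 天气 不错"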
Example #3
    def FitNetwork(self):
        countItems = len(self.trainingDataResults)

        self._fit(self.trainingData, self.trainingDataResults)

        ConsoleOutput.printGreen("Data successfully fitted to the sentence structure network.")
        ConsoleOutput.printGreen("Vectors: " + str(countItems))

        self.trainingData = None
        self.trainingDataResults = None
Example #4
    def loadTextFromFile(self, InputFile):
        ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
        sentence = []
        # Convert to natural language object
        with open(InputFile) as f:
            for line in f:
                #line = line.lower()
                # remove quotes completely
                line = line.replace('"', '')
                line = line.replace("'", '')
                # separate punctuation from the preceding character so each mark gets its own token
                line = re.sub(r'(.)([,.!?:;"()\'\"])', r'\1 \2', line)
                # separate from the following character as well
                line = re.sub(r'([,.!?:;"()\'\"])(.)', r'\1 \2', line)
                sentence.extend(line.split())
        ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
        self._nloTextData = NaturalLanguageObject(sentence)
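To see what the two re.sub passes do, here is a small standalone demo: the first pass pads punctuation on the left, the second on the right, so split() yields a separate token for each mark:

import re

line = 'He said, wait... "stop!"'
line = line.replace('"', '').replace("'", '')
line = re.sub(r'(.)([,.!?:;"()\'\"])', r'\1 \2', line)
line = re.sub(r'([,.!?:;"()\'\"])(.)', r'\1 \2', line)
print(line.split())  # ['He', 'said', ',', 'wait', '.', '.', '.', 'stop', '!']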
Example #5
    def loadVocabularyNormals(self, NNV):
        if(self._nloTextData is not None):
            ConsoleOutput.printGreen("Beginning sentence vocabulary parse...")
            # create a vocabulary with the same number of rows as the identifiers
            vocabulary = [list() for _ in range(len(NaturalLanguageObject._Identifiers))]
            tempNonUniqueVocab = [list() for _ in range(len(NaturalLanguageObject._Identifiers))]
            # Build a vocabulary from the input data, skipping the first few
            # elements; start= keeps wordIndex aligned with the full token list
            # even though the slice drops its head
            for wordIndex, x in enumerate(self._nloTextData.sentenceTokenList[self._TrainRangeV:],
                                          start=self._TrainRangeV):
                word = x[0]
                wordToken = x[1]
                prevTokenNormal = self._nloTextData.sentenceNormalised[wordIndex-1]
                # find which column to insert into
                for iIndex, iden in enumerate(NaturalLanguageObject._Identifiers):
                    # found the column
                    if(iden == wordToken):
                        # check whether this combination of identifier and word already exists
                        if (prevTokenNormal, word) not in vocabulary[iIndex]:
                            # unique sequences are stored in the vocabulary for lookups
                            # when converting from normals back into words
                            vocabulary[iIndex].append((prevTokenNormal, word))
                        else:
                            # collect the non-unique combinations (purely for training)
                            tempNonUniqueVocab[iIndex].append((prevTokenNormal, word))
            # Use the unique sequences to generate normals
            for index, val in enumerate(vocabulary):
                # Calculate the normals for each row
                normalisedUnit = 0
                if(len(vocabulary[index]) > 0):
                    normalisedUnit = 2/len(vocabulary[index])
                for index2, vector in enumerate(vocabulary[index]):
                    tmpNormal = round(float((index2+1) * normalisedUnit), 10)
                    word = vector[1]
                    prevNormal = vector[0]
                    # pass into the network fit buffer (these are the unique combinations)
                    NNV.loadVectorsIntoNetworkByIndex(index, prevNormal, tmpNormal)
                    NNV.loadVocab(index, tmpNormal, word)
                    # check the non-unique list for the same sequence
                    for nonUniqueVal in tempNonUniqueVocab[index]:
                        # if a non-unique sequence matches, add it to training as well
                        if (prevNormal, word) == nonUniqueVal:
                            NNV.loadVectorsIntoNetworkByIndex(index, prevNormal, tmpNormal)
                            NNV.loadVocab(index, tmpNormal, word)
        else:
            raise ValueError('Need to load data via loadFromTextFile() before calling function.')
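As a quick check of the arithmetic: each identifier row spreads its unique entries evenly over the interval (0, 2], so a row with N entries assigns the k-th entry (1-based) the normal 2k/N. A worked example with illustrative words:

# A row with 4 unique (prevNormal, word) entries gets normals spaced by 2/4 = 0.5
row = [(0.1, 'wand'), (0.3, 'owl'), (0.7, 'broom'), (0.9, 'cloak')]
normalisedUnit = 2 / len(row)
normals = [round((k + 1) * normalisedUnit, 10) for k in range(len(row))]
print(normals)  # [0.5, 1.0, 1.5, 2.0]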
Example #6
    def loadTextFromFile_backup(self, InputFile):
        ConsoleOutput.printGreen("Loading text data from: (" + InputFile + ")")
        sentence = []
        # Convert to natural language object
        with open(InputFile, 'r', encoding='UTF-8') as f:
            for line in f:
                #line = line.lower()
                # remove quotes completely
                line = line.replace('"', '')
                line = line.replace("'", '')
                # separate punctuation from adjacent characters so each mark gets its own token
                #line = re.sub(r'(.)([,.!?:;"()\'\"])', r'\1 \2', line)
                # separate from both directions
                #line = re.sub(r'([,.!?:;"()\'\"])(.)', r'\1 \2', line)

                # re.sub's first argument is the pattern to match, the second is the
                # replacement, and the third is the source string to search
                line = re.sub(r'(.)([,。!?:“()‘”“’])', r'\1 \2', line)
                line = re.sub(r'([,。!?:;”()“”‘’])(.)', r'\1 \2', line)
                sentence.extend(line.split())
        ConsoleOutput.printGreen("Data load successful. WordCount: " + str(len(sentence)))
        self._nloTextData = NaturalLanguageObject(sentence)
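The same two-pass padding applied to the fullwidth Chinese punctuation in this backup variant; note that without a segmenter, split() only breaks at whitespace and the padded punctuation:

import re

line = '今天天气不错,我们出去玩吧!'
line = re.sub(r'(.)([,。!?:“()‘”“’])', r'\1 \2', line)
line = re.sub(r'([,。!?:;”()“”‘’])(.)', r'\1 \2', line)
print(line.split())  # ['今天天气不错', ',', '我们出去玩吧', '!']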
Example #7
    def FitNetwork(self):
        countItems = 0
        # print('size of trainingData[6] is:')
        # print(len(self.trainingData[6]))
        # print(self.trainingData[6])
        # train all of the networks at once
        # trainingData[] is a 2D list,
        # len(trainingData) is 45,
        # trainingData[6] has 2233 entries
        for index, val in enumerate(self.trainingData):
            if (len(self.trainingData[index]) > 0):
                self._fit(index, self.trainingData[index],
                          self.trainingDataResults[index])
                countItems = countItems + len(self.trainingData[index])
            else:
                ConsoleOutput.printRed(
                    "No training data for vocab identifier: " +
                    NaturalLanguageObject._Identifiers[index])

        ConsoleOutput.printGreen(
            "Data successfully fitted to the vocabulary network.")
        ConsoleOutput.printGreen("Vectors: " + str(countItems))
        print("\n")

        self.trainingData = None
        self.trainingDataResults = None
Example #8
    def FitNetwork(self):
        countItems = 0
        # train all of the networks at once
        for index, val in enumerate(self.trainingData):
            if(len(self.trainingData[index]) > 0):
                self._fit(index, self.trainingData[index], self.trainingDataResults[index])
                countItems = countItems + len(self.trainingData[index])
            else:
                ConsoleOutput.printRed("No training data for vocab identifier: " + NaturalLanguageObject._Identifiers[index])

        ConsoleOutput.printGreen("Data successfully fitted to the vocabulary network.")
        ConsoleOutput.printGreen("Vectors: " + str(countItems))
        print("\n")

        self.trainingData = None
        self.trainingDataResults = None
Example #9
    def TestVocabulary(self):
        #testingPara = testingParaHarryPotter
        testingPara = self._TestingPara
        passedTests = []
        nonFatalTests = []
        failedTests = []

        # Build a test sequence from each word; start=1 keeps the index
        # aligned with the full token list despite the [1:] slice
        for index, val in enumerate(self._TestingParaNlo.sentenceTokenList[1:], start=1):
            prevWord = self._TestingParaNlo.sentenceTokenList[index-1][0]
            prevWordToken = self._TestingParaNlo.sentenceTokenList[index-1][1]
            prevWordTokenNormal = self._TestingParaNlo.sentenceNormalised[index-1]

            curWord = val[0]
            curToken = val[1]
            curNormal = self._TestingParaNlo.sentenceNormalised[index]

            prediction = self.neuralNetworkV.getPredictedWord(prevWordTokenNormal, curToken)
            probList = self.neuralNetworkV.getPredictionProbability(prevWordTokenNormal, curToken)

            # take the highest class probability
            prob = max(probList[0])

            if(str(curWord.lower()) == str(prediction).lower()):
                passedTests.append("("+str(prevWord)+", "+str(prevWordToken)+")        Target: "+str(curWord)+"        Pred: "+str(prediction)+"   " + str(prob*100) + "%")
            else:
                if(prob < 0.2):
                    failedTests.append("("+str(prevWord)+", "+str(prevWordToken)+")        Target: "+str(curWord)+"        Pred: "+str(prediction)+"    " + str(prob*100) + "%")
                elif (prob > 0.6):
                    passedTests.append("("+str(prevWord)+", "+str(prevWordToken)+")        Target: "+str(curWord)+"        Pred: "+str(prediction)+"   " + str(prob*100) + "%")
                else:
                    nonFatalTests.append("("+str(prevWord)+", "+str(prevWordToken)+")        Target: "+str(curWord)+"        Pred: "+str(prediction)+"    " + str(prob*100) + "%")

        # print results
        print("\n")
        print("********** TestSentenceStructuring() **********")
        print("\n")

        ConsoleOutput.printUnderline("Failed Tests: (" + str(len(failedTests)) + "/" + str(len(testingPara)) + ")")
        for val in failedTests:
            ConsoleOutput.printRed(val)
        print("\n")
        ConsoleOutput.printUnderline("Non-Fatal failed Tests: (" + str(len(nonFatalTests)) + "/" + str(len(testingPara)) + ")")
        for val in nonFatalTests:
            ConsoleOutput.printYellow(val)
        print("\n")
        ConsoleOutput.printUnderline("Passed Tests: (" + str(len(passedTests)) + "/" + str(len(testingPara)) + ")")
        for val in passedTests:
            ConsoleOutput.printGreen(val)
        print("\n")

        ConsoleOutput.printYellow("Passed: " + str(len(passedTests)) + "   Non-Fatals: " + str(len(nonFatalTests)) + "   Fails: " + str(len(failedTests)))
        print("\n")
Example #10
def Main():
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/HarryPotter(xxlarge).txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30

    consoleInArgs = sys.argv[1:]
    # check input arguments
    for index, val in enumerate(consoleInArgs):
        # Runs the unit testing module on initiation
        if(val == "-utss"):
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif(val == "-utv"):
            _isUnitTestingV = True
        elif(len(consoleInArgs) >= index+1):
            # specify training data location
            if(val == "-td"):
                _TrainingDataInputFile = consoleInArgs[index+1]
                ConsoleOutput.printGreen("Training data load locaiton changed to: \"" + _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif(val == "-ts"):
                _TestSentence = consoleInArgs[index+1]
                if(len(_TestSentence.split()) != _TrainRangeSS):
                    raise ValueError('Test sequence must be the same length as the vector training size. (' + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif(val == "-tsc"):
                _TestSequenceGenSize = int(consoleInArgs[index+1])
        else:
            raise ValueError('Unrecognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()

    neuralNetworkSS = NNSentenceStructure()
    neuralNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS)
    # the next word of the sequence is used as the target, for example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(neuralNetworkV)
    # Pass the vectors into the network
    neuralNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS, networkTrainer._TrainingTargetsSS)
    # Pass into vocab network here ****

    # Fit data
    neuralNetworkSS.FitNetwork()
    neuralNetworkV.FitNetwork()
    # Fit to vocab network here ****

    # Use console argument "-utss" to activate
    #testing
    uTester = None
    if(_isUnitTestingSS):
        if(uTester == None):
            uTester = UnitTester(neuralNetworkSS, neuralNetworkV, _TrainRangeSS, _TrainRangeV)
        uTester.TestSentenceStructuring()
    # use console argument "-utv" to activate
    if(_isUnitTestingV):
        if(uTester == None):
            uTester = UnitTester(neuralNetworkSS, neuralNetworkV, _TrainRangeSS, _TrainRangeV)
        uTester.TestVocabulary()

    if(_TestSentence != ""):
        genSize = _TestSequenceGenSize
        initialInput = _TestSentence
        print(initialInput + " ", end="")
        initialInput = initialInput.split()
        # generate a sentence of genSize
        for index in range(0, genSize):
            nlo = NaturalLanguageObject(initialInput)
            # since nlo will always be the right size, we can use that variable
            predToke = neuralNetworkSS.getPrediction(nlo.sentenceNormalised)
            nextToke = nlo.tokeniseNormals([predToke])
            # now we have the next token in the sentence; convert it to a word
            word = neuralNetworkV.getPredictedWord(nlo.sentenceNormalised[-1], nextToke[0])
            print(str(word) + " ", end="")
            initialInput.append(word)
            # maintain a size of 'genSize'
            del initialInput[0]
        print("\n")
    # Reset console back to original state
    deinit()
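Putting the flags together, a typical invocation might look like the following (the script name Main.py is an assumption; the -ts sentence must contain exactly _TrainRangeSS words, four in this illustrative example):

python Main.py -td "Datasets/HarryPotter(xxlarge).txt" -ts "Harry sat on his" -tsc 30 -utss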
Example #11
def Main():
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/Sstt.utf8.txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30
    _OutputFile = None

    consoleInArgs = sys.argv[1:]
    # check input arguments
    for index, val in enumerate(consoleInArgs):
        # Runs the unit testing module on initiation
        if(val == "-utss"):
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif(val == "-utv"):
            _isUnitTestingV = True
        elif(len(consoleInArgs) >= index+1):
            # specify training data location
            if(val == "-td"):
                _TrainingDataInputFile = consoleInArgs[index+1]
                ConsoleOutput.printGreen("Training data load locaiton changed to: \"" + _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif(val == "-ts"):
                _TestSentence = consoleInArgs[index+1]
                if(len(_TestSentence.split()) != _TrainRangeSS):
                    raise ValueError('Test sequence must be the same length as the vector training size. (' + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif(val == "-tsc"):
                _TestSequenceGenSize = int(consoleInArgs[index+1])
                ConsoleOutput.printGreen("Test sequence generation size changed to: " + str(_TestSequenceGenSize))
            # set the output file for the generated data to be printed to
            elif(val == "-of"):
                _OutputFile = str(consoleInArgs[index+1])
                ConsoleOutput.printGreen("Output generation location changed to: (" + consoleInArgs[index+1]+ ")")
        else:
            raise ValueError('Unrecognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()

    MLNetworkSS = NNSentenceStructure()
    MLNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS)
    # the next word of the sequence is used as the target, for example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(MLNetworkV)
    # Pass the vectors into the network
    MLNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS, networkTrainer._TrainingTargetsSS)
    # Pass into vocab network here ****

    # Fit data
    MLNetworkSS.FitNetwork()
    MLNetworkV.FitNetwork()
    # Fit to vocab network here ****

    # Use console argument "-utss" to activate
    #testing
    uTester = None
    if(_isUnitTestingSS):
        #if(uTester == None):
            #uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS, _TrainRangeV)
        #uTester.TestSentenceStructuring()
        print("_isUnitTestingSS is true")
    # use console argument "-utv" to activate
    if(_isUnitTestingV):
        #if(uTester == None):
            #uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS, _TrainRangeV)
        #uTester.TestVocabulary()
        print("_isUnitTestingV is true")
    if(_TestSentence != ""):
        print("_TestSentence is true")
        printToFile = False
        f = None
        # user has specified output location
        if(_OutputFile != None):
            printToFile = True
            f = open(_OutputFile, 'w')
        genSize = _TestSequenceGenSize  # target size of the generated text
        initialInput = _TestSentence
        if(printToFile):
            f.write(initialInput + " ")
        else:
            print(initialInput + " ", end="")
        initialInput = initialInput.split()  # split the input keywords
        # generate a sentence of genSize
        for index in range(0, genSize):
            #print(initialInput)
            nlo = NaturalLanguageObject(initialInput)
            # Work around ambiguous Chinese word segmentation: so that the test data's
            # dimensions match the training data's, drop the extra leading tuples from
            # [('word', tag), ('word', tag), ...]; otherwise the KNN classifier raises an error.
            diff = len(nlo.sentenceNormalised) - _TrainRangeSS
            if(diff > 0):
                nlo.sentenceNormalised = nlo.sentenceNormalised[diff:]
            # since nlo will always be the right size, we can use that variable
            predToke = MLNetworkSS.getPrediction([nlo.sentenceNormalised])
            nextToke = nlo.tokeniseNormals([predToke])
            # now we have the next token in the sentence; convert it to a word
            word = MLNetworkV.getPredictedWord(nlo.sentenceNormalised[-1], nextToke[0])
            # decide whether to print to file or console
            if(printToFile):
                f.write(str(word) + " ")
            else:
                print(str(word) + " ", end="")
            initialInput.append(word)
            # maintain a size of 'genSize'
            del initialInput[0]
        print("\n")
    # Reset console back to original state
    deinit()
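This variant adds the -of flag, so a run that writes the generated text to a file might look like the following (script name and the four-word Chinese seed sentence are illustrative assumptions):

python Main.py -td Datasets/Sstt.utf8.txt -ts "哈利 坐 在 他" -tsc 100 -of output.txt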
Example #12
def Main():
    _isUnitTestingSS = False
    _isUnitTestingV = False
    _recursiveInput = False
    _TrainingDataInputFile = "Datasets/HarryPotter(xxlarge).txt"
    _TestSentence = ""
    _TestSequenceGenSize = 30

    consoleInArgs = sys.argv[1:]
    # check input arguments
    for index, val in enumerate(consoleInArgs):
        # Runs the unit testing module on initiation
        if (val == "-utss"):
            _isUnitTestingSS = True
        # Unit testing for the vocabulary network
        elif (val == "-utv"):
            _isUnitTestingV = True
        elif (len(consoleInArgs) >= index + 1):
            # specify training data location
            if (val == "-td"):
                _TrainingDataInputFile = consoleInArgs[index + 1]
                ConsoleOutput.printGreen(
                    "Training data load location changed to: \"" +
                    _TrainingDataInputFile + "\"")
            # give a generation sentence input
            elif (val == "-ts"):
                _TestSentence = consoleInArgs[index + 1]
                if (len(_TestSentence.split()) != _TrainRangeSS):
                    raise ValueError(
                        'Test sequence must be the same length as the vector training size. ('
                        + str(_TrainRangeSS) + ')')
            # set the amount of words generated after input
            elif (val == "-tsc"):
                _TestSequenceGenSize = int(consoleInArgs[index + 1])
        else:
            raise ValueError('Unrecognized console argument: ' + str(val))
    # Initialise colorama cross-platform console logging
    init()

    MLNetworkSS = NNSentenceStructure()
    MLNetworkV = NNVocabulary()
    # Network trainer converts text data into normalized vectors that
    # can be passed into the networks
    networkTrainer = NetworkTrainer(_TrainRangeSS, _TrainRangeV)
    networkTrainer.loadTextFromFile(_TrainingDataInputFile)
    # Trainer parses the structure into vector normal arrays of size (_TrainRangeSS)
    # the next word of the sequence is used as the target, for example
    # ["Harry", "sat", "on", "his"] - ["broomstick"] <-- target
    networkTrainer.loadSentenceStructureNormals()
    networkTrainer.loadVocabularyNormals(MLNetworkV)
    # Pass the vectors into the network
    MLNetworkSS.loadVectorsIntoNetwork(networkTrainer._TrainingSequenceSS,
                                       networkTrainer._TrainingTargetsSS)
    # Pass into vocab network here ****

    # Fit data
    MLNetworkSS.FitNetwork()
    MLNetworkV.FitNetwork()
    # Fit to vocab network here ****

    # Use console argument "-utss" to activate
    #testing
    uTester = None
    if (_isUnitTestingSS):
        if (uTester == None):
            uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS,
                                 _TrainRangeV)
        uTester.TestSentenceStructuring()
    # use console argument "-utv" to activate
    if (_isUnitTestingV):
        if (uTester == None):
            uTester = UnitTester(MLNetworkSS, MLNetworkV, _TrainRangeSS,
                                 _TrainRangeV)
        uTester.TestVocabulary()

    if (_TestSentence != ""):
        genSize = _TestSequenceGenSize
        initialInput = _TestSentence
        print(initialInput + " ", end="")
        initialInput = initialInput.split()
        # generate a sentence of genSize
        for index in range(0, genSize):
            nlo = NaturalLanguageObject(initialInput)
            # since nlo will always be the right size, we can use that variable
            predToke = MLNetworkSS.getPrediction(nlo.sentenceNormalised)
            nextToke = nlo.tokeniseNormals([predToke])
            # now we have the next token in the sentence; convert it to a word
            word = MLNetworkV.getPredictedWord(nlo.sentenceNormalised[-1],
                                               nextToke[0])
            print(str(word) + " ", end="")
            initialInput.append(word)
            # maintain a size of 'genSize'
            del initialInput[0]
        print("\n")
    # Reset console back to original state
    deinit()
Example #13
    def TestSentenceStructuring(self):

        #testingPara = testingParaHarryPotter
        testingPara = self._TestingPara
        passedTests = []
        nonFatalTests = []
        failedTests = []
        # used to predict accuracy of the network
        acTestPred = []
        acTestTrue = []

        # Build a test sequence from each word
        for index, val in enumerate(testingPara):
            tmpTestSeq = []
            target = None
            # grab the next VectorSizeSS words as the input window
            if(index < len(testingPara)-(self.VectorSizeSS+1)):
                for index2 in range(0, self.VectorSizeSS):
                    tmpTestSeq.append(testingPara[index+index2])
                target = testingPara[index+self.VectorSizeSS]
                # convert to natural language object
                nloTester = NaturalLanguageObject(tmpTestSeq)
                nloTarget = NaturalLanguageObject([target])
                # get neural network prediction
                normalPred = self.neuralNetworkSS.getPrediction(nloTester.sentenceNormalised)
                prediction = str(nloTester.tokeniseNormals([normalPred]))
                comp = str(nloTarget.sentenceTags)

                cTrue = nloTarget.sentenceNormalised[0]
                acTestTrue.append(cTrue*100)
                acTestPred.append(normalPred*100)

                #if first letters match, this means 'NN' will match with 'NNS'
                if(prediction[2] == comp[2]):
                    #filter for probability
                    probList = self.neuralNetworkSS.getPredictionProbability(nloTester.sentenceNormalised)
                    # take the highest class probability
                    prob = max(probList[0])
                    passedTests.append(str(nloTester.sentenceTokenList) + "   Target: " + str(nloTarget.sentenceTokenList) + "    Prediction: "
                    + prediction  + " " +str(prob*100) + "%")
                else:
                    probList = self.neuralNetworkSS.getPredictionProbability(nloTester.sentenceNormalised)
                    # take the highest class probability
                    prob = max(probList[0])
                    # if the probability is less than 30%, add to the failed list
                    if(prob < 0.3):
                        failedTests.append(str(nloTester.sentenceTokenList) + "   Target: " + str(nloTarget.sentenceTokenList) + "    Prediction: "
                        + prediction  + " " +str(prob*100) + "%")
                    else:
                        # if probability is more than 60% its probably passed
                        if(prob > 0.6):
                            passedTests.append(str(nloTester.sentenceTokenList) + "   Target: " + str(nloTarget.sentenceTokenList) + "    Prediction: "
                            + prediction  + " " +str(prob*100) + "%")
                        else:
                            nonFatalTests.append(str(nloTester.sentenceTokenList) + "   Target: " + str(nloTarget.sentenceTokenList) + "    Prediction: "
                            + prediction  + " " +str(prob*100) + "%")

        # print results
        print("\n")
        print("********** TestSentenceStructuring() **********")
        print("\n")
        ConsoleOutput.printUnderline("Failed Tests: (" + str(len(failedTests)) + "/" + str(len(testingPara)) + ")")
        for val in failedTests:
            ConsoleOutput.printRed(val)
        print("\n")
        ConsoleOutput.printUnderline("Non-Fatal failed Tests: (" + str(len(nonFatalTests)) + "/" + str(len(testingPara)) + ")")
        for val in nonFatalTests:
            ConsoleOutput.printYellow(val)
        print("\n")
        ConsoleOutput.printUnderline("Passed Tests: (" + str(len(passedTests)) + "/" + str(len(testingPara)) + ")")
        for val in passedTests:
            ConsoleOutput.printGreen(val)
        print("\n")

        nnAccuracy = accuracy_score(np.array(acTestTrue).astype(int), np.array(acTestPred).astype(int))
        ConsoleOutput.printYellow("Passed: " + str(len(passedTests)) + "   Non-Fatals: " + str(len(nonFatalTests)) + "   Fails: " + str(len(failedTests)))
        ConsoleOutput.printYellow("NeuralNetork accuracy: " + str(round(nnAccuracy*100,1)) + "%")
        print("\n")