Exemplo n.º 1
0
    def runDNNTrain(self,
                    learningRate=0.01,
                    hiddenUnitSize=None,
                    dataKey='Question',
                    labelKey='y'):
        """Train a DNN on the corpus stored at ``TRAIN_EXCEL``.

        The training frame is built with ``self.prepareDF`` and handed to
        ``DNN`` without a test frame (``pd_df_test=None``).

        Args:
            learningRate: Forwarded to ``DNN`` as ``learning_rate``.
            hiddenUnitSize: Forwarded to ``DNN`` as ``hidden_units_size``.
                Defaults to ``[100, 100]`` when omitted or ``None``.
            dataKey: Forwarded to ``DNN`` as ``dataKey``.
            labelKey: Forwarded to ``DNN`` as ``labelKey``.

        Returns:
            Whatever ``DNN.run()`` returns.
        """
        # Build the default per call: a list literal in the signature would be
        # a single object shared (and possibly mutated) across invocations.
        if hiddenUnitSize is None:
            hiddenUnitSize = [100, 100]

        corpusTrain = self.prepareDF(excelLocation=TRAIN_EXCEL)

        dnnObject = DNN(pd_df_train=corpusTrain,
                        pd_df_test=None,
                        learning_rate=learningRate,
                        hidden_units_size=hiddenUnitSize,
                        dataKey=dataKey,
                        labelKey=labelKey)
        return dnnObject.run()
Exemplo n.º 2
0
class TrainTieBot:
    """Build bag-of-words features from a labelled corpus and train classifiers.

    Throughout the class a corpus entry is a single-item dict of the form
    ``{label: sentence}``; only the first key/value of each dict is used.
    """

    def __init__(self):
        # DataFrame returned by the last BOW() call; None until BOW() has run.
        self.bagOfWords = None
        # Vocabulary from the last getFeaturesByName() call; its order defines
        # the column order of the count vectors built by BOW() and BOWFit().
        self.wordFeatures = None

    @staticmethod
    def _firstKey(entry):
        """Return the first key of ``entry`` (IndexError if it is empty)."""
        # dict views cannot be indexed on Python 3; list() works on 2 and 3.
        return list(entry.keys())[0]

    @staticmethod
    def _firstValue(entry):
        """Return the first value of ``entry`` (IndexError if it is empty)."""
        return list(entry.values())[0]

    def tokenize(self, corpus):
        """Split the sentence of one corpus entry into lowercase word tokens.

        Args:
            corpus: A dict whose first value is the sentence string.

        Returns:
            A list of ``str`` tokens matching ``\\w+`` in the lowercased sentence.

        Raises:
            TrainTieBotException: If ``corpus`` is not a dict.
        """
        if not isinstance(corpus, dict):
            raise TrainTieBotException('Corpus must be of type:dict()')
        tokenizer = RegexpTokenizer(r'\w+')
        sentence = self._firstValue(corpus).lower()
        return [str(word) for word in tokenizer.tokenize(sentence)]

    def cleanUpQuery(self, sentence):
        """Lowercase ``sentence`` and return its ``\\w+`` tokens."""
        tokenizer = RegexpTokenizer(r'\w+')
        return tokenizer.tokenize(sentence.lower())

    def getFeaturesByName(self, corpus):
        """Collect the distinct words of a tokenized corpus.

        Args:
            corpus: A list of dicts whose first value is an iterable of words.

        Returns:
            The list of unique words, also stored in ``self.wordFeatures``.
            The order is whatever ``set`` iteration yields.

        Raises:
            TrainTieBotException: If ``corpus`` is not a list.
        """
        if not isinstance(corpus, list):
            raise TrainTieBotException('Corpus must be of type:list()')
        words = []
        for entry in corpus:
            words.extend(self._firstValue(entry))
        self.wordFeatures = list(set(words))
        return self.wordFeatures

    def BOW(self, corpus):
        """Build the bag-of-words frame for ``corpus`` and remember it.

        The vocabulary is derived from the tokenized sentences; each row holds
        the per-word counts for one entry, followed by a ``'y'`` column with
        the entry's label (its first key).

        Args:
            corpus: A list of ``{label: sentence}`` dicts.

        Returns:
            The DataFrame, also stored in ``self.bagOfWords``.

        Raises:
            TrainTieBotException: If ``corpus`` is not a list, or an entry is
                not a dict.
        """
        if not isinstance(corpus, list):
            raise TrainTieBotException('Corpus must be of type:list()')
        tokenDict = [{self._firstKey(x): self.tokenize(x)} for x in corpus]
        features = self.getFeaturesByName(tokenDict)

        counts = np.zeros([len(corpus), len(features)], dtype=int)
        for row, entry in enumerate(corpus):
            sentence = self._firstValue(entry)
            # NOTE(review): counting is done on the raw entry value, not on the
            # lowercased tokens, so for a string this is a case-sensitive
            # substring count. Kept as-is to leave existing features unchanged;
            # confirm whether token counts were intended.
            counts[row, :] = [sentence.count(item) for item in features]

        vectorDataFrame = pd.DataFrame(data=counts)
        # The label column goes last; getX() relies on that position.
        vectorDataFrame['y'] = [self._firstKey(x) for x in corpus]
        self.bagOfWords = vectorDataFrame
        return self.bagOfWords

    def BOWFit(self, query):
        """Vectorize ``query`` against the vocabulary in ``self.wordFeatures``.

        Args:
            query: A list of ``{label: sentence}`` dicts.

        Returns:
            A DataFrame with one count column per known word and a trailing
            ``'y'`` column holding each entry's label.

        Raises:
            TrainTieBotException: If ``BOW`` has not produced a bag of words
                yet, or an entry of ``query`` is not a dict.
        """
        if self.bagOfWords is None:
            raise TrainTieBotException('Create Bag of Words vectors before fitting it to a new query.')

        counts = np.zeros([len(query), len(self.wordFeatures)], dtype=int)
        labels = []
        for row, entry in enumerate(query):
            if not isinstance(entry, dict):
                raise TrainTieBotException('Corpus must be of type:dict()')
            labels.append(self._firstKey(entry))
            sentence = self._firstValue(entry)
            # NOTE(review): same raw-value counting as in BOW(); see there.
            counts[row, :] = [sentence.count(item) for item in self.wordFeatures]

        arrayFitDf = pd.DataFrame(data=counts)
        arrayFitDf['y'] = labels
        return arrayFitDf

    def list2df(self, data, dataKey, labelKey):
        """Convert a list of ``{label: value}`` dicts into a two-column frame.

        Args:
            data: Iterable of single-item dicts.
            dataKey: Column name for the dict values.
            labelKey: Column name for the dict keys.

        Returns:
            A DataFrame with columns ``dataKey`` and ``labelKey``.
        """
        columns = {dataKey: [], labelKey: []}
        for obj in data:
            columns[dataKey].append(self._firstValue(obj))
            columns[labelKey].append(self._firstKey(obj))
        return pd.DataFrame.from_dict(columns)

    def df2list(self, df, dataKey, labelKey):
        """Convert the two named columns of ``df`` into ``{label: value}`` dicts.

        Args:
            df: DataFrame containing both columns.
            dataKey: Name of the column supplying the dict values.
            labelKey: Name of the column supplying the dict keys.

        Returns:
            A list with one single-item dict per row, in row order.
        """
        df = df[[dataKey, labelKey]]
        labels = df[labelKey]
        values = df[dataKey]
        # Positional access so frames with a non-default index still work.
        return [{labels.iloc[i]: values.iloc[i]} for i in range(df.shape[0])]

    def prepareDF(self, corpus=None, dataKey='Question', labelKey='y', excelLocation=None):
        """Return the expanded corpus as a DataFrame.

        Args:
            corpus: List of ``{label: sentence}`` dicts. When ``None`` the
                corpus is read via ``Corpus.loadData(excelLocation)``.
            dataKey: Name of the sentence column.
            labelKey: Name of the label column.
            excelLocation: Passed to ``Corpus.loadData``; only used when
                ``corpus`` is ``None``.

        Returns:
            DataFrame with columns ``dataKey`` and ``labelKey``.
        """
        corpusObj = Corpus()
        if corpus is None:
            corpus = self.df2list(corpusObj.loadData(excelLocation), dataKey, labelKey)
        # Expand Vocabulary list with part of speeches
        expanded = corpusObj.getExpandedSentences(corpus)
        return self.list2df(expanded, dataKey, labelKey)

    def getX(self, corpus=None):
        """Build the training features and labels.

        Args:
            corpus: List of ``{label: sentence}`` dicts. When ``None`` the
                corpus is loaded from ``TRAIN_EXCEL`` using the ``'Question'``
                and ``'y'`` columns.

        Returns:
            ``(X, y)``: every column of the bag-of-words frame except the
            last, and the last column (the labels).
        """
        corpusObj = Corpus()
        if corpus is None:
            corpus = self.df2list(corpusObj.loadData(TRAIN_EXCEL), 'Question', 'y')
        BOWTrain = self.BOW(corpusObj.getExpandedSentences(corpus))
        return BOWTrain.iloc[:, :-1], BOWTrain.iloc[:, -1]

    def gety(self, query):
        """Vectorize ``query`` with the already-fitted vocabulary.

        Args:
            query: List of ``{label: sentence}`` dicts.

        Returns:
            ``(X_test, y_test)``: the count columns and the ``'y'`` column of
            the frame returned by ``BOWFit``.
        """
        corpusTest = Corpus().getExpandedSentences(query)
        BOWTest = self.BOWFit(corpusTest)
        return BOWTest.iloc[:, :-1], BOWTest['y']

    def runDNNTrain(self, corpus=None, learningRate=0.02, hiddenUnitSize=None, dataKey='Question', labelKey='y'):
        """Train a DNN and keep the model object on ``self.dnnObject``.

        Args:
            corpus: Training DataFrame. When ``None`` it is built with
                ``prepareDF(excelLocation=TRAIN_EXCEL)``.
            learningRate: Forwarded to ``DNN`` as ``learning_rate``.
            hiddenUnitSize: Forwarded to ``DNN`` as ``hidden_units_size``.
                Defaults to ``[128, 256, 128]`` when omitted or ``None``.
            dataKey: Forwarded to ``DNN`` as ``dataKey``.
            labelKey: Forwarded to ``DNN`` as ``labelKey``.

        Returns:
            Whatever ``DNN.run()`` returns.
        """
        # Build the default per call: a list literal in the signature would be
        # a single object shared (and possibly mutated) across invocations.
        if hiddenUnitSize is None:
            hiddenUnitSize = [128, 256, 128]
        if corpus is None:
            corpus = self.prepareDF(excelLocation=TRAIN_EXCEL)

        self.dnnObject = DNN(pd_df_train=corpus,
                             pd_df_test=None,
                             learning_rate=learningRate,
                             hidden_units_size=hiddenUnitSize,
                             dataKey=dataKey,
                             labelKey=labelKey)
        return self.dnnObject.run()

    def runSVM(self, corpus=None):
        """Train an ``SVMClassifier`` on the features from ``getX(corpus)``.

        Returns:
            Whatever ``SVMClassifier.train()`` returns.
        """
        X, y = self.getX(corpus)
        return SVMClassifier(X, y).train()

    def runLR(self, corpus=None, query=None):
        """Train a ``LogisticRegClassifier`` on the features from ``getX(corpus)``.

        Args:
            corpus: Passed to ``getX``.
            query: Unused; kept so existing callers keep working.

        Returns:
            Whatever ``LogisticRegClassifier.train()`` returns.
        """
        X, y = self.getX(corpus)
        return LogisticRegClassifier(X, y).train()

    def getAnswer(self, answer_index):
        """Return ``'Answer'[answer_index]`` from the data loaded from ``TRAIN_EXCEL``."""
        answer = Corpus().loadData(TRAIN_EXCEL)
        return answer['Answer'][answer_index]