class TrainTieBot:
    """Build bag-of-words features from a question corpus and train
    DNN, SVM, or logistic-regression classifiers on them."""

    def __init__(self):
        self.bagOfWords = None
        self.wordFeatures = None

    def tokenize(self, corpus):
        # Tokenize the sentence stored as the single value of the corpus dict.
        if not isinstance(corpus, dict):
            raise TrainTieBotException('Corpus must be of type: dict()')
        tokenizer = RegexpTokenizer(r'\w+')
        token = [str(x) for x in tokenizer.tokenize(corpus.values()[0].lower())]
        return token

    def cleanUpQuery(self, sentence):
        # Lowercase a raw sentence and split it into word tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        sentence = sentence.lower()
        return tokenizer.tokenize(sentence)

    def getFeaturesByName(self, corpus):
        # Collect the unique vocabulary across all tokenized entries.
        words = list()
        if not isinstance(corpus, list):
            raise TrainTieBotException('Corpus must be of type: list()')
        for x in corpus:
            words.extend(x.values()[0])
        self.wordFeatures = list(set(words))
        return self.wordFeatures

    def BOW(self, corpus):
        # Build the bag-of-words training matrix: one row per corpus entry,
        # one column per vocabulary word, plus a 'y' label column.
        if not isinstance(corpus, list):
            raise TrainTieBotException('Corpus must be of type: list()')
        tokenDict = [{x.keys()[0]: self.tokenize(x)} for x in corpus]
        getWordFeatures = self.getFeaturesByName(tokenDict)
        vectorDataFrame = pd.DataFrame(
            data=np.zeros([len(corpus), len(getWordFeatures)]).astype(int))
        # Fill each row with the counts of every vocabulary item in that entry.
        for word in vectorDataFrame.index.tolist():
            vectorDataFrame.loc[word, :] = \
                [corpus[word].values()[0].count(item)
                 if item in corpus[word].values()[0] else 0
                 for item in getWordFeatures]
        # Add a label column.
        labels = [x.keys()[0] for x in corpus]
        vectorDataFrame['y'] = labels
        self.bagOfWords = vectorDataFrame
        return self.bagOfWords

    def BOWFit(self, query):
        # Vectorize new queries against the vocabulary learned by BOW().
        if self.bagOfWords is None:
            raise TrainTieBotException('Create Bag of Words vectors before fitting it to a new query.')
        tokenDict = [{x.keys()[0]: self.tokenize(x)} for x in query]
        arrayFitDf = pd.DataFrame(
            data=np.zeros([len(tokenDict), len(self.wordFeatures)])).astype(int)
        arrayFitDf['y'] = [x.keys()[0] for x in query]
        for i in range(len(query)):
            arrayFitDf.iloc[i, :-1] = \
                [query[i].values()[0].count(item)
                 if item in query[i].values()[0] else 0
                 for item in self.wordFeatures]
        return arrayFitDf

    def list2df(self, data, dataKey, labelKey):
        # Convert a list of {label: sentence} dicts into a two-column DataFrame.
        df = {}
        df[dataKey] = []
        df[labelKey] = []
        for obj in data:
            df[dataKey].append(obj.values()[0])
            df[labelKey].append(obj.keys()[0])
        return pd.DataFrame.from_dict(df)

    def df2list(self, df, dataKey, labelKey):
        # Convert a two-column DataFrame back into a list of {label: sentence} dicts.
        df = df[[dataKey, labelKey]]
        return [{df[labelKey][x]: df[dataKey][x]} for x in range(df.shape[0])]

    def prepareDF(self, corpus=None, dataKey='Question', labelKey='y', excelLocation=None):
        # Load (or accept) a corpus, expand it with part-of-speech variants,
        # and return it as a DataFrame ready for the DNN.
        if corpus is None:
            corpusObj = Corpus()
            corpusT = corpusObj.loadData(excelLocation)
            corpusT = self.df2list(corpusT, dataKey, labelKey)
            # Expand the vocabulary with part-of-speech variants.
            corpusT = corpusObj.getExpandedSentences(corpusT)
            corpusT = self.list2df(corpusT, dataKey, labelKey)
        else:
            corpusObj = Corpus()
            corpusT = corpusObj.getExpandedSentences(corpus)
            corpusT = self.list2df(corpusT, dataKey, labelKey)
        return corpusT

    def getX(self, corpus=None):
        # Return the bag-of-words feature matrix X and label vector y for training.
        dataKey = 'Question'
        labelKey = 'y'
        corpusObj = Corpus()
        if corpus is None:
            corpusTrain = corpusObj.loadData(TRAIN_EXCEL)
            corpusTrain = self.df2list(corpusTrain, dataKey, labelKey)
            corpusTrain = corpusObj.getExpandedSentences(corpusTrain)
            BOWTrain = self.BOW(corpusTrain)
            X = BOWTrain.iloc[:, :-1]
            y = BOWTrain.iloc[:, -1]
        else:
            corpusTrain = corpusObj.getExpandedSentences(corpus)
            BOWTrain = self.BOW(corpusTrain)
            X = BOWTrain.iloc[:, :-1]
            y = BOWTrain.iloc[:, -1]
        return X, y

    def gety(self, query):
        # Return the feature matrix and labels for a test query.
        dataKey = 'Question'
        labelKey = 'y'
        corpusTestObj = Corpus()
        corpusTest = corpusTestObj.getExpandedSentences(query)
        BOWTest = self.BOWFit(corpusTest)
        X_test = BOWTest.iloc[:, :-1]
        y_test = BOWTest[labelKey]
        return X_test, y_test

    def runDNNTrain(self, corpus=None, learningRate=0.02, hiddenUnitSize=[128, 256, 128],
                    dataKey='Question', labelKey='y'):
        # Train the DNN on the prepared corpus (loaded from TRAIN_EXCEL if none is given).
        if corpus is None:
            corpus = self.prepareDF(excelLocation=TRAIN_EXCEL)
        self.dnnObject = DNN(pd_df_train=corpus,
                             pd_df_test=None,
                             learning_rate=learningRate,
                             hidden_units_size=hiddenUnitSize,
                             dataKey=dataKey,
                             labelKey=labelKey)
        result = self.dnnObject.run()
        return result

    def runSVM(self, corpus=None):
        # Train an SVM classifier on the bag-of-words features.
        X, y = self.getX(corpus)
        svm = SVMClassifier(X, y)
        clf = svm.train()
        # y_pred = clf.predict(X)
        # y_test_pred = clf.predict(X_test)
        # result1 = float(sum((y == y_pred) + 0)) / y_pred.shape[0]
        # result2 = float(sum((y_test == y_test_pred) + 0)) / y_test_pred.shape[0]
        # print 'SVM Training set accuracy: ', result1
        # print 'SVM Test set accuracy: ', result2
        return clf

    def runLR(self, corpus=None, query=None):
        # Train a logistic-regression classifier on the bag-of-words features.
        X, y = self.getX(corpus)
        lr = LogisticRegClassifier(X, y)
        clf = lr.train()
        # y_pred = clf.predict(X)
        # y_test_pred = clf.predict(X_test)
        # result1 = float(sum((y == y_pred) + 0)) / y_pred.shape[0]
        # result2 = float(sum((y_test == y_test_pred) + 0)) / y_test_pred.shape[0]
        # print 'LR Training set accuracy: ', result1
        # print 'LR Test set accuracy: ', result2
        return clf

    def getAnswer(self, answer_index):
        # Look up the canned answer for a predicted label index.
        answer = Corpus().loadData(TRAIN_EXCEL)
        return answer['Answer'][answer_index]
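
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): it illustrates the corpus
# format the class expects -- a list of single-key dicts mapping a label to a
# sentence -- and exercises BOW()/BOWFit(). The toy sentences and labels are
# hypothetical; Corpus, TRAIN_EXCEL, DNN, SVMClassifier and
# LogisticRegClassifier are assumed to be provided elsewhere in the project,
# and the dict-view indexing used above (e.g. corpus.values()[0]) assumes
# Python 2.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    trainer = TrainTieBot()

    # Toy training corpus: one {label: sentence} dict per example.
    toyCorpus = [
        {'greeting': 'hello how are you'},
        {'hours': 'what time do you open'},
    ]

    # Build the bag-of-words matrix; the last column 'y' holds the labels.
    bowTrain = trainer.BOW(toyCorpus)

    # Vectorize a new query against the vocabulary learned above.
    bowQuery = trainer.BOWFit([{'greeting': 'hello there'}])

    # With the Excel-backed corpus available, a classifier could be trained
    # and a predicted label index mapped back to an answer, e.g.:
    # clf = trainer.runSVM()
    # answer = trainer.getAnswer(0)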