def load(self, folder=None, filenameW2V=None, filenameMatrix=None,
         filenameSearchTree=None, filenameMessageMap=None):
    """Restore the bot state from files written earlier.

    Every argument left as None falls back to the instance attribute of
    the same name.  The method assigns ``W2V``, ``vectorizer``, ``tree``,
    ``sentenceMatrix``, ``sentenceMatrixNoNans``, ``correspondenceVector``
    and ``FilteredToUnfilteredMessageMap`` and prints a progress line
    after each stage.
    """
    # Fall back to the names stored on the instance
    folder = self.folder if folder is None else folder
    filenameW2V = self.filenameW2V if filenameW2V is None else filenameW2V
    filenameMatrix = (self.filenameMatrix if filenameMatrix is None
                      else filenameMatrix)
    filenameSearchTree = (self.filenameSearchTree
                          if filenameSearchTree is None
                          else filenameSearchTree)
    filenameMessageMap = (self.filenameMessageMap
                          if filenameMessageMap is None
                          else filenameMessageMap)

    # Word2Vec model read back from disk
    self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
    print('loaded W2V')

    # Sentence vectorizer wrapping the loaded model
    self.vectorizer = SentenceVectorizer(self.W2V)
    print('loaded vectorizer')

    # Search tree read back from disk (no matrix is passed in)
    self.tree = SearchTree(sentences_matrix=None,
                           filename=filenameSearchTree,
                           folder=folder)
    print('loaded tree')

    # Pickled sentence matrix and its NaN-free companion
    matrix_path = os.path.join(folder, filenameMatrix)
    with open(matrix_path, 'rb') as matrix_file:
        self.sentenceMatrix = pickle.load(matrix_file)
    cleaned = remove_NaNs_from(self.sentenceMatrix)
    self.sentenceMatrixNoNans, self.correspondenceVector = cleaned
    print('loaded matrix')

    # Pickled filtered -> unfiltered message map
    map_path = os.path.join(folder, filenameMessageMap)
    with open(map_path, 'rb') as map_file:
        self.FilteredToUnfilteredMessageMap = pickle.load(map_file)
    print('loaded message map')
def load(self, folder = None, filenameW2V = None, filenameMatrix = None,
         filenameSearchTree = None, filenameMessageMap = None):
    """Restore the bot state from files written earlier.

    Every argument left as None falls back to the instance attribute of
    the same name.  Besides loading the Word2Vec model, the search tree,
    the sentence matrix and the message map, the method rebuilds the
    normalized and level-extended sentence matrices, which requires
    ``self.sentenceLevelNoNans`` and ``self.levelFactor`` to be set
    already.  A progress line is printed after each stage.
    """
    # Fall back to the names stored on the instance
    folder = self.folder if folder is None else folder
    filenameW2V = self.filenameW2V if filenameW2V is None else filenameW2V
    filenameMatrix = (self.filenameMatrix if filenameMatrix is None
                      else filenameMatrix)
    filenameSearchTree = (self.filenameSearchTree
                          if filenameSearchTree is None
                          else filenameSearchTree)
    filenameMessageMap = (self.filenameMessageMap
                          if filenameMessageMap is None
                          else filenameMessageMap)

    # Word2Vec model read back from disk
    self.W2V = Word2Vec(filenameW2V, folder = folder, load_from_file = True)
    print('loaded W2V')

    # Sentence vectorizer wrapping the loaded model
    self.vectorizer = SentenceVectorizer(self.W2V)
    print('loaded vectorizer')

    # Search tree read back from disk (no matrix is passed in)
    self.treeExtended = SearchTree(sentences_matrix=None,
                                   filename=filenameSearchTree,
                                   folder=folder)
    print('loaded tree')

    # Pickled sentence matrix and the matrices derived from it
    matrix_path = os.path.join(folder, filenameMatrix)
    with open(matrix_path, 'rb') as matrix_file:
        self.sentenceMatrix = pickle.load(matrix_file)
    cleaned = remove_NaNs_from(self.sentenceMatrix)
    self.sentenceMatrixNoNans, self.correspondenceVector = cleaned

    # Scale every vector by the largest 2-norm found along axis 1
    vector_lengths = sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1)
    self.MaximumVectorLength = np.max(vector_lengths)
    self.sentenceMatrixNormalized = (self.sentenceMatrixNoNans
                                     / self.MaximumVectorLength)

    # Append one extra column: the sentence level scaled by levelFactor
    level_column = (self.sentenceLevelNoNans[self.correspondenceVector]
                    * self.levelFactor)
    self.sentenceMatrixExtended = np.hstack((self.sentenceMatrixNormalized,
                                             level_column))
    print('loaded matrix')

    # Pickled filtered -> unfiltered message map
    map_path = os.path.join(folder, filenameMessageMap)
    with open(map_path, 'rb') as map_file:
        self.FilteredToUnfilteredMessageMap = pickle.load(map_file)
    print('loaded message map')
def _initChatBot(self, folder, filenameClient, filenameAgent, filenameW2V,
                 filenameSearchTree, filenameMessageMap, load_from_file,
                 retrain_on_agent_file=False):
    """Load or build everything the bot needs to answer a message.

    The messages dataframe ('MessagesDF.pkl') and the pickled
    filtered-to-unfiltered message map are always read from ``folder``.

    When ``load_from_file`` is true the Word2Vec model and the search tree
    are read from ``filenameW2V`` and ``filenameSearchTree``.  Otherwise
    the model is trained on ``filenameClient`` (and, if
    ``retrain_on_agent_file`` is true, retrained on ``filenameAgent``),
    the sentence matrix of the client file is computed and stored in
    ``self.matrix``, and the search tree is built from that matrix.

    ``self.vectorizer`` is created in both cases.  ``self.matrix`` is only
    set when the bot is generated from scratch.
    """
    # Load messages dataframe
    self.MessagesDataFrame = mdb.load_data_frame('MessagesDF.pkl', folder)

    # Load the map from filtered to unfiltered message positions
    fn = os.path.join(folder, filenameMessageMap)
    with open(fn, 'rb') as fid:
        self.FilteredToUnfilteredMessageMap = pickle.load(fid)

    # Either load everything from files:
    if load_from_file:
        self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
        # The vectorizer is needed to vectorize incoming messages, so it
        # has to exist on this path as well.
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.tree = SearchTree(sentences_matrix=None,
                               filename=filenameSearchTree,
                               folder=folder)
    # or generate it from scratch:
    else:
        # Train W2V model on client file
        self.W2V = Word2Vec(filenameClient, folder=folder, workers=7)
        # Retrain (optionally) on agent file
        if retrain_on_agent_file:
            self.W2V.retrain(filenameAgent, folder=folder)
        # Make the sentence vectorizer object, get the sentence matrix and
        # only then generate the search tree: the tree is built from the
        # matrix, so the matrix must be computed first.
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.matrix = self.vectorizer.get_sentences_matrix(filenameClient,
                                                           folder=folder)
        self.tree = SearchTree(self.matrix)
Created on Fri Aug 19 16:52:19 2016 @author: piromast """ # In[1]: from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree import message_db.message_db as mdb folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data' filenameClient = '04_clientMessagesFiltered.txt' filenameAgent = '04_agentMessagesFiltered.txt' W2V = Word2Vec(filenameClient, folder=folder, workers=7) #W2V.retrain(filenameAgent, folder = folder) vectorizer = SentenceVectorizer(W2V) matrix = vectorizer.get_sentences_matrix(filenameClient, folder=folder) tree = SearchTree(matrix) # In[2]: import pandas as pd import numpy as np import os filenameIndexTable = os.path.join(folder, 'client_agent_summary.csv') indexTable = pd.read_csv(filenameIndexTable) linePositionInFilteredFile = indexTable['linePosFiltered'].values linePosition = indexTable['linePos'].values # Construct map from filtered to unfiltered file line number
def _initChatBot(self, folder, filenameClient, filenameAgent,
                 filenameMessageDF, filenameMessageInfoTable, load_from_file,
                 retrain_on_agent_file = True):
    """Load or build everything the bot needs to answer a message.

    The messages dataframe and the chat info table (CSV) are always read
    from ``folder``.  From the table the method keeps the 'convPos' and
    'convID' columns as column vectors and the 'convPos' entries of the
    rows whose 'linePosFiltered' is not NaN.

    When ``load_from_file`` is true the rest of the state is restored by
    ``self.load()``.  Otherwise the Word2Vec model is trained on
    ``filenameClient`` (and, if ``retrain_on_agent_file`` is true,
    retrained on ``filenameAgent``), the sentence matrix is computed,
    normalized and extended with the scaled sentence level, and the search
    tree is built from the extended matrix.

    An ``InputCleaner`` is instantiated in both cases.
    """
    # Load messages dataframe
    self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

    # Import chat info table
    filenameIndexTable = os.path.join(folder, filenameMessageInfoTable)
    indexTable = pd.read_csv(filenameIndexTable)

    # Extract sentence level information and conversation number
    # (the order of each sentence in a conversation)
    self.sentenceLevel = indexTable['convPos'].values.reshape((-1, 1))
    self.conversationNumber = indexTable['convID'].values.reshape((-1, 1))
    linePositionInFilteredFile = indexTable['linePosFiltered'].values
    linePosition = indexTable['linePos'].values
    self.sentenceLevelNoNans = self.sentenceLevel[
        ~np.isnan(linePositionInFilteredFile)]

    # Either load everything from files:
    if load_from_file:
        self.load()
    # or generate it from scratch:
    else:
        # Train W2V model on client file
        self.W2V = Word2Vec(filenameClient, folder = folder, workers = 7)
        # Retrain (optionally) on agent file
        if retrain_on_agent_file:
            self.W2V.retrain(filenameAgent, folder = folder, workers = 7)

        # Make the sentence vectorizer object and get the sentence matrix
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.sentenceMatrix = self.vectorizer.get_sentences_matrix(
            filenameClient, folder = folder)

        # Construct map from filtered to unfiltered file line number.
        # The two columns are walked row by row: the previous version used
        # the *values* of linePosition as row indices, which is only right
        # when 'linePos' happens to equal the row number.  The builtin int
        # replaces the np.int alias, which recent NumPy no longer provides.
        self.FilteredToUnfilteredMessageMap = [
            int(unfilteredPos)
            for unfilteredPos, filteredPos
            in zip(linePosition, linePositionInFilteredFile)
            if not np.isnan(filteredPos)]

        # Normalize sentenceMatrix
        self.sentenceMatrixNoNans, self.correspondenceVector = \
            remove_NaNs_from(self.sentenceMatrix)
        self.MaximumVectorLength = np.max(
            sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1))
        self.sentenceMatrixNormalized = (self.sentenceMatrixNoNans
                                         / self.MaximumVectorLength)

        # Extend sentence matrix with one dimension equal to level
        self.sentenceMatrixExtended = np.hstack((
            self.sentenceMatrixNormalized,
            self.sentenceLevelNoNans[self.correspondenceVector]
            * self.levelFactor))

        # Calculate compound filtered to unfiltered message map
        self.FilteredToUnfilteredMessageMap = [
            self.FilteredToUnfilteredMessageMap[i]
            for i in self.correspondenceVector]

        # Generate search tree on the extended matrix
        self.treeExtended = SearchTree(self.sentenceMatrixExtended)

    # Instantiate input cleaner object
    self.cleaner = InputCleaner(folder)
class ChattyV2(object):
    """Chatbot answering with the agent reply of the nearest client message.

    Client messages are turned into sentence vectors, scaled by the
    largest vector length and extended with one extra component: the
    position of the message in the conversation multiplied by
    ``levelFactor``.  A reply is found by looking up the nearest extended
    vector in a search tree.
    """

    def __init__(self,
                 folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data',
                 filenameClient = '05_clientMessagesFilteredfastText.txt',
                 filenameAgent = '05_agentMessagesFilteredfastText.txt',
                 filenameMessageDF = 'MessagesDF.pkl',
                 filenameW2V = 'W2V_Chatty2_0.mdl',
                 filenameSearchTree = 'SearchTree_Chatty2_0.pkl',
                 filenameMessageMap = 'Filtered2UnfilteredMessageMap_Chatty2_0.pkl',
                 filenameMessageInfoTable = 'client_agent_summary3.csv',
                 filenameMatrix = 'SentenceMatrix_Chatty2_0.pkl',
                 load_from_file = True,
                 levelFactor = 0.35):
        """Store the file names and initialise the bot.

        With ``load_from_file`` true the state is read from the given
        files, otherwise it is generated from the client/agent files.
        """
        # Init properties
        self.levelFactor = levelFactor
        self.conversation_level = 0
        self.name = 'Chatty 2.0'
        self.folder = folder
        self.filenameClient = filenameClient
        self.filenameAgent = filenameAgent
        self.filenameMessageDF = filenameMessageDF
        self.filenameW2V = filenameW2V
        self.filenameSearchTree = filenameSearchTree
        self.filenameMessageMap = filenameMessageMap
        self.filenameMessageInfoTable = filenameMessageInfoTable
        self.filenameMatrix = filenameMatrix
        # Init bot
        self._initChatBot(folder, filenameClient, filenameAgent,
                          filenameMessageDF, filenameMessageInfoTable,
                          load_from_file)

    def _initChatBot(self, folder, filenameClient, filenameAgent,
                     filenameMessageDF, filenameMessageInfoTable,
                     load_from_file, retrain_on_agent_file = True):
        """Load or build everything the bot needs to answer a message.

        The messages dataframe and the chat info table (CSV) are always
        read from ``folder``.  When ``load_from_file`` is true the rest is
        restored by ``self.load()``; otherwise the Word2Vec model is
        trained, the sentence matrices are computed and the search tree is
        built.  An ``InputCleaner`` is instantiated in both cases.
        """
        # Load messages dataframe
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Import chat info table
        filenameIndexTable = os.path.join(folder, filenameMessageInfoTable)
        indexTable = pd.read_csv(filenameIndexTable)

        # Extract sentence level information and conversation number
        # (the order of each sentence in a conversation)
        self.sentenceLevel = indexTable['convPos'].values.reshape((-1, 1))
        self.conversationNumber = indexTable['convID'].values.reshape((-1, 1))
        linePositionInFilteredFile = indexTable['linePosFiltered'].values
        linePosition = indexTable['linePos'].values
        self.sentenceLevelNoNans = self.sentenceLevel[
            ~np.isnan(linePositionInFilteredFile)]

        # Either load everything from files:
        if load_from_file:
            self.load()
        # or generate it from scratch:
        else:
            # Train W2V model on client file
            self.W2V = Word2Vec(filenameClient, folder = folder, workers = 7)
            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder = folder, workers = 7)

            # Make the sentence vectorizer object and get sentence matrix
            self.vectorizer = SentenceVectorizer(self.W2V)
            self.sentenceMatrix = self.vectorizer.get_sentences_matrix(
                filenameClient, folder = folder)

            # Construct map from filtered to unfiltered file line number
            self.FilteredToUnfilteredMessageMap = \
                self._buildFilteredToUnfilteredMap(linePosition,
                                                   linePositionInFilteredFile)

            # Normalize and extend the sentence matrix
            self._normalizeSentenceMatrix()

            # Calculate compound filtered to unfiltered message map
            self.FilteredToUnfilteredMessageMap = [
                self.FilteredToUnfilteredMessageMap[i]
                for i in self.correspondenceVector]

            # Generate search tree on the extended matrix
            self.treeExtended = SearchTree(self.sentenceMatrixExtended)

        # Instantiate input cleaner object
        self.cleaner = InputCleaner(folder)

    @staticmethod
    def _buildFilteredToUnfilteredMap(linePosition, linePositionInFilteredFile):
        """Return the unfiltered positions of the rows kept by the filter.

        The two sequences are walked in parallel; a row is kept when its
        filtered position is not NaN.  The previous inline version used
        the *values* of ``linePosition`` as row indices, which is only
        right when they equal the row numbers, and relied on the ``np.int``
        alias that recent NumPy no longer provides.
        """
        return [int(unfilteredPos)
                for unfilteredPos, filteredPos
                in zip(linePosition, linePositionInFilteredFile)
                if not np.isnan(filteredPos)]

    def _normalizeSentenceMatrix(self):
        """Derive the NaN-free, normalized and extended sentence matrices."""
        self.sentenceMatrixNoNans, self.correspondenceVector = \
            remove_NaNs_from(self.sentenceMatrix)
        # Scale every vector by the largest 2-norm found along axis 1
        self.MaximumVectorLength = np.max(
            sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1))
        self.sentenceMatrixNormalized = (self.sentenceMatrixNoNans
                                         / self.MaximumVectorLength)
        self._extendSentenceMatrix()

    def _extendSentenceMatrix(self):
        """Append the sentence level, scaled by levelFactor, as a column."""
        self.sentenceMatrixExtended = np.hstack((
            self.sentenceMatrixNormalized,
            self.sentenceLevelNoNans[self.correspondenceVector]
            * self.levelFactor))

    def getReplyFromChatBot(self, input_message):
        """Return ``(AgentMessage, ClientMessage)`` for the nearest match.

        The conversation level is increased by one on every vectorized
        message.  Any exception is printed instead of raised, in which
        case the method returns None.
        """
        try:
            # Clean message
            input_message = self.cleaner.clean_input(input_message)
            # Vectorize message
            vect = self._getSentenceVector(input_message,
                                           self.conversation_level)
            self.conversation_level += 1
            # Find nearest message in client corpus
            ind, _ = self.treeExtended.findNearestVector(vect)
            print(ind)
            # Unmap to account for filter-removed messages
            ind = self.FilteredToUnfilteredMessageMap[ind]
            # Get closest client and agent message
            df = self.MessagesDataFrame
            ClientMessage = mdb.get_client_message_from_dataframe(df, ind)
            AgentMessage = mdb.get_agent_message_from_dataframe(df, ind)
            return (AgentMessage, ClientMessage)
        except Exception as inst:
            # Best effort: report the problem and keep the chat alive
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args
            print(inst)          # __str__ allows args to be printed directly
            return None

    def resetChat(self):
        """Start a new conversation by resetting the conversation level."""
        self.conversation_level = 0

    def save(self, folder = None, filenameW2V = None, filenameMatrix = None,
             filenameSearchTree = None, filenameMessageMap = None):
        """Write the model, tree, message map and sentence matrix to disk.

        Every argument left as None falls back to the instance attribute
        of the same name.  The map and the matrix are pickled with
        protocol 2.
        """
        if folder is None:
            folder = self.folder
        if filenameW2V is None:
            filenameW2V = self.filenameW2V
        if filenameMatrix is None:
            filenameMatrix = self.filenameMatrix
        if filenameSearchTree is None:
            filenameSearchTree = self.filenameSearchTree
        if filenameMessageMap is None:
            filenameMessageMap = self.filenameMessageMap

        self.W2V.save(filenameW2V, folder)
        print('saved W2V')
        self.treeExtended.save_tree(filenameSearchTree, folder)
        print('saved tree')
        with open(os.path.join(folder, filenameMessageMap), 'wb') as f:
            pickle.dump(self.FilteredToUnfilteredMessageMap, f, protocol=2)
        print('saved message map')
        with open(os.path.join(folder, filenameMatrix), 'wb') as f:
            pickle.dump(self.sentenceMatrix, f, protocol=2)
        print('saved matrix')

    def load(self, folder = None, filenameW2V = None, filenameMatrix = None,
             filenameSearchTree = None, filenameMessageMap = None):
        """Restore the state written by ``save``.

        Every argument left as None falls back to the instance attribute
        of the same name.  Rebuilding the extended matrix needs
        ``self.sentenceLevelNoNans`` and ``self.levelFactor`` to be set.
        """
        if folder is None:
            folder = self.folder
        if filenameW2V is None:
            filenameW2V = self.filenameW2V
        if filenameMatrix is None:
            filenameMatrix = self.filenameMatrix
        if filenameSearchTree is None:
            filenameSearchTree = self.filenameSearchTree
        if filenameMessageMap is None:
            filenameMessageMap = self.filenameMessageMap

        # Load w2v object
        self.W2V = Word2Vec(filenameW2V, folder = folder, load_from_file = True)
        print('loaded W2V')
        # Load vectorizer
        self.vectorizer = SentenceVectorizer(self.W2V)
        print('loaded vectorizer')
        # Load search tree
        self.treeExtended = SearchTree(sentences_matrix=None,
                                       filename=filenameSearchTree,
                                       folder=folder)
        print('loaded tree')
        # Load sentence matrix and prepare submatrices
        fn = os.path.join(folder, filenameMatrix)
        with open(fn, 'rb') as fid:
            self.sentenceMatrix = pickle.load(fid)
        self._normalizeSentenceMatrix()
        print('loaded matrix')
        # Load message map
        fn = os.path.join(folder, filenameMessageMap)
        with open(fn, 'rb') as fid:
            self.FilteredToUnfilteredMessageMap = pickle.load(fid)
        print('loaded message map')

    def changeLevelFactor(self, levelFactor):
        """Set a new level factor and rebuild the extended matrix and tree."""
        self.levelFactor = levelFactor
        # Extend sentence matrix with one dimension equal to level
        self._extendSentenceMatrix()
        self.treeExtended = SearchTree(self.sentenceMatrixExtended)
        print('Chenged level to'+str(self.levelFactor))

    def _getSentenceVector(self, text, level):
        """Return the scaled sentence vector of ``text`` extended by level.

        The vector is divided by ``MaximumVectorLength`` and gets
        ``level * levelFactor`` appended as its last component.
        """
        vect = self.vectorizer.get_sentence_vector(text)/self.MaximumVectorLength
        vect = np.hstack((vect, level*self.levelFactor))
        return vect
def __get_score(self, train_index, test_index, size, window, min_count,
                workers):
    """Private method doing the hard work described in test method.

    A word2vec model is trained on the ``train_index`` sentences of every
    file in ``self.inputFileNames``; a second model is trained on the
    ``test_index`` sentences of ``self.outputFileName``.  Each test input
    sentence is then matched to its nearest vector in the test input
    matrix, and the output vector stored at that index is compared with
    the observed output vector.

    Returns a tuple ``(total_cosine_similarity, total_input_distance,
    total_output_distance)`` with the sums over all compared sentences.
    """
    for i, inputFile in enumerate(self.inputFileNames):
        print('Training iteration:', str(i))
        sentenceGeneratorTrainSetInput = SentencesGeneratorInList(
            inputFile, train_index)
        if i == 0:
            # First train is done creating instance of Word2Vec.
            # The hyper-parameters are passed by keyword: gensim's
            # constructor has other parameters (e.g. alpha) between them,
            # so passing them positionally binds them to the wrong names.
            W2VInput = gensim.models.Word2Vec(sentenceGeneratorTrainSetInput,
                                              size=size,
                                              window=window,
                                              min_count=min_count,
                                              workers=workers)
            W2VInput = Word2Vec(W2VInput)
        else:
            # Subsequent training is done calling train method
            # NOTE(review): the positional arguments after the sentences
            # do not look like what gensim's train() expects in those
            # positions; left unchanged, verify against the installed
            # gensim version.
            W2VInput.model.train(sentenceGeneratorTrainSetInput,
                                 size, window, min_count, workers)

    # Generate input and output sentence generators
    sentenceGeneratorTestSetInput = SentencesGeneratorInList(
        self.inputFileNames[0], test_index)
    sentenceGeneratorTestSetOutput = SentencesGeneratorInList(
        self.outputFileName, test_index)

    # Train output word2vec model (same keyword fix as above)
    W2VOutput = gensim.models.Word2Vec(sentenceGeneratorTestSetOutput,
                                       size=size,
                                       window=window,
                                       min_count=min_count,
                                       workers=workers)
    W2VOutput = Word2Vec(W2VOutput)

    vectorizerInput = SentenceVectorizer(W2VInput)
    vectorizerOutput = SentenceVectorizer(W2VOutput)

    # Calculate matrices of sentence vectors of the test sentences
    # NOTE(review): NaN rows are removed from the two matrices
    # independently; presumably the rows still correspond pairwise
    # afterwards, confirm.
    matrixInput = vectorizerInput.get_sentences_matrix(
        sentenceGenerator = sentenceGeneratorTestSetInput)
    matrixInput, _ = remove_NaNs_from(matrixInput)
    matrixOutput = vectorizerOutput.get_sentences_matrix(
        sentenceGenerator = sentenceGeneratorTestSetOutput)
    matrixOutput, _ = remove_NaNs_from(matrixOutput)

    # Generate search tree (to find nearest vector)
    treeInput = SearchTree(matrixInput)

    # Initialize different distance measures
    # (input and output euclidean distance and cosine measure)
    total_input_distance = 0
    total_output_distance = 0
    total_cosine_similarity = 0

    for vIn, vOut in zip(matrixInput, matrixOutput):
        # Find nearest vector to input in sentence corpus
        indexInput, distanceInput = treeInput.findNearestVector(vIn)
        # Accumulate input vector distance to nearest neighbour
        total_input_distance += distanceInput

        # Output vector corresponding to nearest input sentence
        vOutPredicted = matrixOutput[indexInput, ]

        # Euclidean distance between observed and predicted output
        total_output_distance += spd.euclidean(vOut, vOutPredicted)

        # NOTE(review): assuming spd is scipy.spatial.distance, cosine()
        # is the cosine *distance* (1 - similarity), so this total is a
        # sum of distances despite its name.  Kept as is for callers.
        total_cosine_similarity += spd.cosine(vOut, vOutPredicted)

    return (total_cosine_similarity, total_input_distance,
            total_output_distance)
@author: piromast """ # In[1]: from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree, remove_NaNs_from import message_db.message_db as mdb import os from preprocessing.filtering import filter_line, build_names, NLTKPreprocessor folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data' #filenameAgent = '04_agentMessagesFilteredTrigramLemmatized.txt' filenameClient = '04_clientMessagesFilteredOld.txt' filenameAgent = '04_agentMessagesFilteredOld.txt' W2V = Word2Vec(filenameClient, folder = folder) vectorizer = SentenceVectorizer(W2V) sentenceMatrix = vectorizer.get_sentences_matrix(filenameClient, folder = folder) tree = SearchTree(sentenceMatrix) # In[2]: import pandas as pd import numpy as np import scipy as sp filenameIndexTable = os.path.join(folder,'client_agent_summaryOld.csv') indexTable = pd.read_csv(filenameIndexTable) sentenceLevel = indexTable['convPos'].values.reshape((-1,1)) conversationNumber = indexTable['convID'].values.reshape((-1,1)) linePositionInFilteredFile = indexTable['linePosFiltered'].values
class ChattyV1(object):
    """Chatbot answering with the agent reply of the nearest client message.

    An incoming message is turned into a sentence vector, the nearest
    vector is looked up in a search tree, and the client/agent messages at
    the mapped position of the messages dataframe are returned.
    """

    def __init__(
            self,
            folder='C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data',
            filenameClient='04_clientMessagesFiltered.txt',
            filenameAgent='04_agentMessagesFiltered.txt',
            filenameMessageDF='MessagesDF.pkl',
            filenameW2V='W2V_Chatty1_0.mdl',
            filenameSearchTree='SearchTree_Chatty1_0.pkl',
            filenameMessageMap='Filtered2UnfilteredMessageMap_Chatty1_0.pkl',
            load_from_file=True):
        """Initialise the bot from files or from scratch.

        With ``load_from_file`` true the model and the search tree are
        read from the given files, otherwise they are generated from the
        client file.
        """
        # Init bot
        self._initChatBot(folder, filenameClient, filenameAgent, filenameW2V,
                          filenameSearchTree, filenameMessageMap,
                          load_from_file,
                          filenameMessageDF=filenameMessageDF)

    def _initChatBot(self, folder, filenameClient, filenameAgent, filenameW2V,
                     filenameSearchTree, filenameMessageMap, load_from_file,
                     retrain_on_agent_file=False,
                     filenameMessageDF='MessagesDF.pkl'):
        """Load or build everything the bot needs to answer a message.

        The messages dataframe (``filenameMessageDF``) and the pickled
        filtered-to-unfiltered message map are always read from
        ``folder``.  ``self.vectorizer`` is created in both modes;
        ``self.matrix`` is only set when the bot is generated from
        scratch.
        """
        # Load messages dataframe (the file name used to be hard-coded,
        # which ignored the name given to the constructor)
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Load the map from filtered to unfiltered message positions
        fn = os.path.join(folder, filenameMessageMap)
        with open(fn, 'rb') as fid:
            self.FilteredToUnfilteredMessageMap = pickle.load(fid)

        # Either load everything from files:
        if load_from_file:
            self.W2V = Word2Vec(filenameW2V, folder=folder,
                                load_from_file=True)
            # getReplyFromChatBot needs the vectorizer on this path too
            self.vectorizer = SentenceVectorizer(self.W2V)
            self.tree = SearchTree(sentences_matrix=None,
                                   filename=filenameSearchTree,
                                   folder=folder)
        # or generate it from scratch:
        else:
            # Train W2V model on client file
            self.W2V = Word2Vec(filenameClient, folder=folder, workers=7)
            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder=folder)
            # Make the sentence vectorizer object, get the sentence matrix
            # and only then generate the search tree: the tree is built
            # from the matrix, so the matrix must be computed first.
            self.vectorizer = SentenceVectorizer(self.W2V)
            self.matrix = self.vectorizer.get_sentences_matrix(
                filenameClient, folder=folder)
            self.tree = SearchTree(self.matrix)

    def getReplyFromChatBot(self, input_message):
        """Return ``(AgentMessage, ClientMessage)`` for the nearest match.

        Any exception is printed instead of raised, in which case the
        method returns None.
        """
        try:
            # Vectorize message
            vect = self.vectorizer.get_sentence_vector(input_message)
            print(vect)
            # Find nearest message in client corpus
            ind, _ = self.tree.findNearestVector(vect)
            print(ind)
            # Unmap to account for filter-removed messages
            ind = self.FilteredToUnfilteredMessageMap[ind]
            # Get closest client and agent message
            df = self.MessagesDataFrame
            ClientMessage = mdb.get_client_message_from_dataframe(df, ind)
            AgentMessage = mdb.get_agent_message_from_dataframe(df, ind)
            return (AgentMessage, ClientMessage)
        except Exception as inst:
            # Best effort: report the problem and keep the chat alive.
            # print() calls replace the former print statements so the
            # class also parses under Python 3.
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args
            print(inst)          # __str__ allows args to be printed directly
            return None
# Add app folder to PYTHONPATH
from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree
import os.path

# Client messages file used for both training and the sentence matrix
filename = os.path.join(
    "/Users",
    "piromast",
    "Dropbox",
    "S2DS - M&S",
    "Data",
    "04_clientMessagesFiltered.txt",
)

# Train word2vec with file in filename
W2V = Word2Vec(filename)

# Make sentence vectorizer object
vectorizer = SentenceVectorizer(W2V)

# Calculate matrix of sentence vectors of whole text file
matrix = vectorizer.get_sentences_matrix(filename)

# Generate search tree (to find nearest vector)
tree = SearchTree(matrix)

# Lookup example: convert a sentence to a vector, then query the tree
v = vectorizer.get_sentence_vector("I lost my order")
index, distance = tree.findNearestVector(v)