def changeLevelFactor(self, levelFactor):
    """Update the level weighting factor and rebuild the extended search tree.

    Parameters
    ----------
    levelFactor : float
        Scale applied to the sentence-level column that is appended to the
        normalized sentence matrix.
    """
    self.levelFactor = levelFactor
    # Extend sentence matrix with one dimension equal to the (scaled) level
    self.sentenceMatrixExtended = np.hstack(
        (self.sentenceMatrixNormalized,
         self.sentenceLevelNoNans[self.correspondenceVector] * self.levelFactor))
    # The search tree must be regenerated whenever the extended matrix changes
    self.treeExtended = SearchTree(self.sentenceMatrixExtended)
    # BUG FIX: status message was misspelled and missing a space
    # ('Chenged level to3.0' -> 'Changed level to 3.0')
    print('Changed level to ' + str(self.levelFactor))
def _initChatBot(self, folder, filenameClient, filenameAgent, filenameW2V, filenameSearchTree, filenameMessageMap, load_from_file, retrain_on_agent_file=False):
    """Initialise the chatbot components either from saved files or from scratch.

    Parameters
    ----------
    folder : str
        Directory holding all data files.
    filenameClient, filenameAgent : str
        Client / agent message text files (the agent file is only used when
        retrain_on_agent_file is True).
    filenameW2V, filenameSearchTree : str
        Saved word2vec model / search tree, used when load_from_file is True.
    filenameMessageMap : str
        Pickle holding the filtered -> unfiltered message line-number map.
    load_from_file : bool
        If True restore saved components; otherwise train from scratch.
    retrain_on_agent_file : bool
        Whether to continue word2vec training on the agent messages.
    """
    # Load messages dataframe
    # NOTE(review): the dataframe filename is hard-coded, unlike every
    # other file name -- consider promoting it to a parameter.
    self.MessagesDataFrame = mdb.load_data_frame('MessagesDF.pkl', folder)
    # Map from filtered line numbers back to the unfiltered message file
    fn = os.path.join(folder, filenameMessageMap)
    with open(fn, 'rb') as fid:
        self.FilteredToUnfilteredMessageMap = pickle.load(fid)
    # Either load everything from files:
    if load_from_file:
        self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
        self.tree = SearchTree(sentences_matrix=None, filename=filenameSearchTree, folder=folder)
    # or generate it from scratch:
    else:
        # Train W2V model on client file
        self.W2V = Word2Vec(filenameClient, folder=folder, workers=7)
        # Retrain (optionally) on agent file
        if retrain_on_agent_file:
            self.W2V.retrain(filenameAgent, folder=folder)
        # Make the sentence vectorizer object, get the sentence matrix,
        # and generate the search tree.
        # BUG FIX: the tree was previously built from self.matrix *before*
        # the vectorizer had computed it (AttributeError on this path);
        # the tree construction now comes last.
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.matrix = self.vectorizer.get_sentences_matrix(filenameClient, folder=folder)
        self.tree = SearchTree(self.matrix)
def load(self, folder=None, filenameW2V=None, filenameMatrix=None, filenameSearchTree=None, filenameMessageMap=None):
    """Restore the chatbot state from previously saved files.

    Loads, in order: the word2vec model, the sentence vectorizer, the
    level-extended search tree, the sentence matrix (which is then
    NaN-filtered, normalized and extended with the scaled sentence level),
    and the filtered -> unfiltered message map.  Any argument left as None
    falls back to the corresponding attribute stored on the instance.
    """
    def pick(value, fallback):
        # Explicit None test so that falsy-but-valid values are preserved.
        return fallback if value is None else value

    folder = pick(folder, self.folder)
    filenameW2V = pick(filenameW2V, self.filenameW2V)
    filenameMatrix = pick(filenameMatrix, self.filenameMatrix)
    filenameSearchTree = pick(filenameSearchTree, self.filenameSearchTree)
    filenameMessageMap = pick(filenameMessageMap, self.filenameMessageMap)

    # Word2Vec model and the sentence vectorizer built on top of it.
    self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
    print('loaded W2V')
    self.vectorizer = SentenceVectorizer(self.W2V)
    print('loaded vectorizer')

    # Pre-built nearest-neighbour tree over the extended sentence matrix.
    self.treeExtended = SearchTree(sentences_matrix=None, filename=filenameSearchTree, folder=folder)
    print('loaded tree')

    # Sentence matrix: unpickle, drop NaN rows, normalize by the largest
    # row norm, then append one column carrying the scaled sentence level.
    with open(os.path.join(folder, filenameMatrix), 'rb') as handle:
        self.sentenceMatrix = pickle.load(handle)
    self.sentenceMatrixNoNans, self.correspondenceVector = remove_NaNs_from(self.sentenceMatrix)
    self.MaximumVectorLength = np.max(sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1))
    self.sentenceMatrixNormalized = self.sentenceMatrixNoNans / self.MaximumVectorLength
    self.sentenceMatrixExtended = np.hstack(
        (self.sentenceMatrixNormalized,
         self.sentenceLevelNoNans[self.correspondenceVector] * self.levelFactor))
    print('loaded matrix')

    # Map from filtered line numbers back to the unfiltered message file.
    with open(os.path.join(folder, filenameMessageMap), 'rb') as handle:
        self.FilteredToUnfilteredMessageMap = pickle.load(handle)
    print('loaded message map')
def load(self, folder=None, filenameW2V=None, filenameMatrix=None, filenameSearchTree=None, filenameMessageMap=None):
    """Reload persisted chatbot components, reporting progress per step.

    Restores the word2vec model, sentence vectorizer, search tree,
    sentence matrix (plus its NaN-free view and row map) and the
    filtered -> unfiltered message map.  Arguments left as None default
    to the values remembered on the instance.
    """
    folder = self.folder if folder is None else folder
    filenameW2V = self.filenameW2V if filenameW2V is None else filenameW2V
    filenameMatrix = self.filenameMatrix if filenameMatrix is None else filenameMatrix
    filenameSearchTree = self.filenameSearchTree if filenameSearchTree is None else filenameSearchTree
    filenameMessageMap = self.filenameMessageMap if filenameMessageMap is None else filenameMessageMap

    # Word2Vec model, restored from its saved file.
    self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
    print('loaded W2V')

    # Sentence vectorizer wrapping the loaded model.
    self.vectorizer = SentenceVectorizer(self.W2V)
    print('loaded vectorizer')

    # Nearest-neighbour search tree, restored from file rather than rebuilt.
    self.tree = SearchTree(sentences_matrix=None, filename=filenameSearchTree, folder=folder)
    print('loaded tree')

    # Raw sentence matrix, its NaN-free submatrix and the vector mapping
    # filtered rows back to rows of the original matrix.
    matrix_path = os.path.join(folder, filenameMatrix)
    with open(matrix_path, 'rb') as stream:
        self.sentenceMatrix = pickle.load(stream)
    self.sentenceMatrixNoNans, self.correspondenceVector = remove_NaNs_from(
        self.sentenceMatrix)
    # (Unlike the extended-matrix variant of load(), this version does not
    # normalize or level-extend the matrix here.)
    print('loaded matrix')

    # Filtered -> unfiltered message line-number map.
    map_path = os.path.join(folder, filenameMessageMap)
    with open(map_path, 'rb') as stream:
        self.FilteredToUnfilteredMessageMap = pickle.load(stream)
    print('loaded message map')
@author: piromast """ # In[1]: from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree import message_db.message_db as mdb folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data' filenameClient = '04_clientMessagesFiltered.txt' filenameAgent = '04_agentMessagesFiltered.txt' W2V = Word2Vec(filenameClient, folder=folder, workers=7) #W2V.retrain(filenameAgent, folder = folder) vectorizer = SentenceVectorizer(W2V) matrix = vectorizer.get_sentences_matrix(filenameClient, folder=folder) tree = SearchTree(matrix) # In[2]: import pandas as pd import numpy as np import os filenameIndexTable = os.path.join(folder, 'client_agent_summary.csv') indexTable = pd.read_csv(filenameIndexTable) linePositionInFilteredFile = indexTable['linePosFiltered'].values linePosition = indexTable['linePos'].values # Construct map from filtered to unfiltered file line number filteredToUnfiltedPositionMap = [ np.int(linePosition[i]) for i in linePosition.tolist()
def _initChatBot(self, folder, filenameClient, filenameAgent, filenameMessageDF, filenameMessageInfoTable, load_from_file, retrain_on_agent_file = True):
    """Build (or reload) all chatbot components.

    Parameters
    ----------
    folder : str
        Directory containing all data files.
    filenameClient, filenameAgent : str
        Client / agent message text files (the agent file is only used
        when retrain_on_agent_file is True).
    filenameMessageDF : str
        Pickled pandas DataFrame of messages.
    filenameMessageInfoTable : str
        CSV index table with per-sentence conversation metadata
        ('convPos', 'convID', 'linePos', 'linePosFiltered' columns).
    load_from_file : bool
        If True, restore previously saved components via self.load();
        otherwise train / compute everything from scratch.
    retrain_on_agent_file : bool
        Whether to continue word2vec training on the agent messages.

    NOTE(review): this block was recovered from a single collapsed line;
    the branch indentation below is the most plausible reconstruction --
    confirm against the originally formatted source.
    """
    # Load messages dataframe
    self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)
    # Import chat info table
    filenameIndexTable = os.path.join(folder,filenameMessageInfoTable)
    indexTable = pd.read_csv(filenameIndexTable)
    # Extract sentence level information and conversation number
    # (the order of each sentence in a conversation)
    self.sentenceLevel = indexTable['convPos'].values.reshape((-1,1))
    self.conversationNumber = indexTable['convID'].values.reshape((-1,1))
    linePositionInFilteredFile = indexTable['linePosFiltered'].values
    linePosition = indexTable['linePos'].values
    # Keep level entries only for sentences that survived filtering
    # (NaN in 'linePosFiltered' marks a filtered-out sentence)
    self.sentenceLevelNoNans = self.sentenceLevel[~np.isnan(linePositionInFilteredFile)]
    # Either load everything from files:
    if load_from_file:
        self.load()
    # or generate it from scratch:
    else:
        # Train W2V model on client file
        self.W2V = Word2Vec(filenameClient, folder = folder, workers = 7)
        # Retrain (optionally) on agent file
        # NOTE(review): another _initChatBot variant calls retrain()
        # without the workers keyword -- confirm retrain() accepts it.
        if retrain_on_agent_file:
            self.W2V.retrain(filenameAgent, folder = folder, workers = 7)
        # Make the sentence vectorizer object, get sentence matrix
        # and generate search tree
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.sentenceMatrix = self.vectorizer.get_sentences_matrix(filenameClient, folder = folder)
        # Construct map from filtered to unfiltered file line number
        # NOTE(review): np.int was removed in NumPy >= 1.24; the builtin
        # int is the drop-in replacement.
        self.FilteredToUnfilteredMessageMap = [np.int(linePosition[i]) for i in linePosition.tolist() if not np.isnan(linePositionInFilteredFile[i])]
        # Normalize sentenceMatrix by its largest row norm
        self.sentenceMatrixNoNans, self.correspondenceVector = remove_NaNs_from(self.sentenceMatrix)
        self.MaximumVectorLength = np.max(sp.linalg.norm(self.sentenceMatrixNoNans,ord=2,axis=1))
        self.sentenceMatrixNormalized = self.sentenceMatrixNoNans/self.MaximumVectorLength
        # Extend sentence matrix with one dimension equal to the scaled level
        self.sentenceMatrixExtended=np.hstack((self.sentenceMatrixNormalized, self.sentenceLevelNoNans[self.correspondenceVector]*self.levelFactor))
        # Calculate compound filtered to unfiltered message map
        # (compose the NaN-row correspondence with the line-number map)
        self.FilteredToUnfilteredMessageMap = [self.FilteredToUnfilteredMessageMap[i] for i in self.correspondenceVector]
        # Search tree over the extended matrix
        self.treeExtended = SearchTree(self.sentenceMatrixExtended)
    # Instantiate input cleaner object
    self.cleaner = InputCleaner(folder)
def __get_score(self, train_index, test_index, size, window , min_count, workers):
    """Train word2vec on the training split and score prediction quality
    on the test split.

    For every test (input, output) sentence-vector pair, the nearest
    training-input vector is looked up in a search tree and the output
    vector at the same row is taken as the prediction.

    Parameters
    ----------
    train_index, test_index : sequence of int
        Line indices selecting the train / test sentences.
    size, window, min_count, workers :
        word2vec hyper-parameters, forwarded positionally to gensim.

    Returns
    -------
    tuple
        (total_cosine_similarity, total_input_distance,
         total_output_distance), each summed over all test pairs.

    NOTE(review): the positional call
    gensim.models.Word2Vec(sentences, size, window, min_count, workers)
    maps the 2nd..5th arguments onto (size, alpha, window, min_count) in
    classic gensim signatures, i.e. 'window' would silently become the
    learning rate; model.train() likewise does not accept these
    hyper-parameters. Confirm against the gensim version in use.
    NOTE(review): block recovered from collapsed lines; indentation of
    the evaluation phase (after the training loop) is reconstructed.
    """
    # --- training phase: one pass per input file ---------------------
    for i, inputFile in enumerate(self.inputFileNames):
        print('Training iteration:',str(i))
        sentenceGeneratorTrainSetInput = SentencesGeneratorInList(inputFile, train_index)
        if i == 0:
            # First train is done creating instance of Word2Vec,
            # then wrapping it in the project's Word2Vec class
            W2VInput = gensim.models.Word2Vec(sentenceGeneratorTrainSetInput, size, window, min_count, workers)
            W2VInput = Word2Vec(W2VInput)
        else:
            # Subsequent training is done calling train method
            W2VInput.model.train(sentenceGeneratorTrainSetInput, size, window, min_count, workers)
    # --- evaluation phase --------------------------------------------
    # Generate input and output sentence generators over the test split
    sentenceGeneratorTestSetInput = SentencesGeneratorInList(self.inputFileNames[0], test_index)
    sentenceGeneratorTestSetOutput = SentencesGeneratorInList(self.outputFileName, test_index)
    # Train the output word2vec model on the test-output sentences
    W2VOutput = gensim.models.Word2Vec(sentenceGeneratorTestSetOutput, size, window, min_count, workers)
    W2VOutput = Word2Vec(W2VOutput)
    vectorizerInput = SentenceVectorizer(W2VInput)
    vectorizerOutput = SentenceVectorizer(W2VOutput)
    # Calculate matrices of sentence vectors, dropping NaN rows
    matrixInput = vectorizerInput.get_sentences_matrix(sentenceGenerator = sentenceGeneratorTestSetInput)
    matrixInput,_ = remove_NaNs_from(matrixInput)
    matrixOutput = vectorizerOutput.get_sentences_matrix(sentenceGenerator = sentenceGeneratorTestSetOutput)
    matrixOutput,_ = remove_NaNs_from(matrixOutput)
    # Generate search tree (to find nearest vector)
    treeInput = SearchTree(matrixInput)
    #treeOutput = SearchTree(matrixOutput)
    n = 0 # Sentence number counter
    # Initialize the accumulated distance measures
    # (input/output euclidean distance and cosine similarity)
    total_input_distance = 0
    total_output_distance = 0
    total_cosine_similarity = 0
    for vIn, vOut in zip(matrixInput, matrixOutput):
        #print('Print:',str(i))
        n += 1
        # Find nearest vector to the input in the sentence corpus
        indexInput, distanceInput = treeInput.findNearestVector(vIn)
        # Accumulate input vector distance to nearest neighbour
        total_input_distance += distanceInput
        # The output vector at the nearest input's row is the prediction
        vOutPredicted = matrixOutput[indexInput,]
        # Euclidean distance between observed and predicted output
        distanceOutput = spd.euclidean(vOut, vOutPredicted)
        total_output_distance += distanceOutput
        # Cosine similarity between observation and prediction
        cosineSimilarity = spd.cosine(vOut, vOutPredicted)
        total_cosine_similarity += cosineSimilarity
    return (total_cosine_similarity, total_input_distance, total_output_distance)
# Build a nearest-neighbour search tree over client sentence vectors and
# load the chat index table used to map filtered message lines back to
# the original (unfiltered) message file.
from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree, remove_NaNs_from
import message_db.message_db as mdb
import os
from preprocessing.filtering import filter_line, build_names, NLTKPreprocessor

folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data'
#filenameAgent = '04_agentMessagesFilteredTrigramLemmatized.txt'
filenameClient = '04_clientMessagesFilteredOld.txt'
filenameAgent = '04_agentMessagesFilteredOld.txt'

# Train word2vec on the client file and vectorize every sentence.
W2V = Word2Vec(filenameClient, folder=folder)
vectorizer = SentenceVectorizer(W2V)
sentenceMatrix = vectorizer.get_sentences_matrix(filenameClient, folder=folder)
tree = SearchTree(sentenceMatrix)

# In[2]:
import pandas as pd
import numpy as np
import scipy as sp

# Per-sentence conversation metadata for the corpus.
filenameIndexTable = os.path.join(folder, 'client_agent_summaryOld.csv')
indexTable = pd.read_csv(filenameIndexTable)
sentenceLevel = indexTable['convPos'].values.reshape((-1, 1))
conversationNumber = indexTable['convID'].values.reshape((-1, 1))
linePositionInFilteredFile = indexTable['linePosFiltered'].values
linePosition = indexTable['linePos'].values

# Construct map from filtered to unfiltered file line number.
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin int is the drop-in replacement.
filteredToUnfiltedPositionMap = [int(linePosition[i])
                                 for i in linePosition.tolist()
                                 if not np.isnan(linePositionInFilteredFile[i])]
# Add app folder to PYTHONPATH
from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree
import os.path

# Location of the filtered client-message corpus.
_path_parts = ("/Users", "piromast", "Dropbox", "S2DS - M&S",
               "Data", "04_clientMessagesFiltered.txt")
filename = os.path.join(*_path_parts)

# Pipeline: train word2vec on the corpus, wrap it in a sentence
# vectorizer, embed every sentence, then index the resulting matrix
# in a nearest-neighbour search tree.
W2V = Word2Vec(filename)
vectorizer = SentenceVectorizer(W2V)
matrix = vectorizer.get_sentences_matrix(filename)
tree = SearchTree(matrix)

# Example query: embed a sentence and retrieve its nearest neighbour
# together with the distance to it.
v = vectorizer.get_sentence_vector("I lost my order")
index, distance = tree.findNearestVector(v)