示例#1
0
    def load(self,
             folder=None,
             filenameW2V=None,
             filenameMatrix=None,
             filenameSearchTree=None,
             filenameMessageMap=None):
        """Restore the bot's components from previously saved files.

        Any argument left as ``None`` falls back to the instance attribute of
        the same name (``self.folder``, ``self.filenameW2V``, ...).

        Args:
            folder: Directory that holds the saved files.
            filenameW2V: File name handed to ``Word2Vec`` for loading.
            filenameMatrix: Pickle file containing the sentence matrix.
            filenameSearchTree: File name handed to ``SearchTree`` for loading.
            filenameMessageMap: Pickle file containing the
                filtered-to-unfiltered message map.
        """
        # Resolve defaults lazily so instance attributes are only read
        # for the arguments that were actually omitted.
        folder = self.folder if folder is None else folder
        filenameW2V = self.filenameW2V if filenameW2V is None else filenameW2V
        filenameMatrix = (self.filenameMatrix
                          if filenameMatrix is None else filenameMatrix)
        filenameSearchTree = (self.filenameSearchTree
                              if filenameSearchTree is None
                              else filenameSearchTree)
        filenameMessageMap = (self.filenameMessageMap
                              if filenameMessageMap is None
                              else filenameMessageMap)

        # Word2Vec model, read back from disk
        self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
        print('loaded W2V')

        # Sentence vectorizer built on top of the loaded model
        self.vectorizer = SentenceVectorizer(self.W2V)
        print('loaded vectorizer')

        # Search tree, read back from disk (no matrix supplied)
        self.tree = SearchTree(sentences_matrix=None,
                               filename=filenameSearchTree,
                               folder=folder)
        print('loaded tree')

        # Sentence matrix plus its NaN-free companion
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        matrix_path = os.path.join(folder, filenameMatrix)
        with open(matrix_path, 'rb') as matrix_file:
            self.sentenceMatrix = pickle.load(matrix_file)
        cleaned = remove_NaNs_from(self.sentenceMatrix)
        self.sentenceMatrixNoNans, self.correspondenceVector = cleaned
        print('loaded matrix')

        # Filtered-to-unfiltered message map
        map_path = os.path.join(folder, filenameMessageMap)
        with open(map_path, 'rb') as map_file:
            self.FilteredToUnfilteredMessageMap = pickle.load(map_file)
        print('loaded message map')
示例#2
0
    def load(self, folder=None, filenameW2V=None, filenameMatrix=None,
             filenameSearchTree=None, filenameMessageMap=None):
        """Restore the bot's components from previously saved files.

        Any argument left as ``None`` falls back to the instance attribute of
        the same name. The method reads ``self.sentenceLevelNoNans`` and
        ``self.levelFactor``, so both must already be set on the instance.

        Args:
            folder: Directory that holds the saved files.
            filenameW2V: File name handed to ``Word2Vec`` for loading.
            filenameMatrix: Pickle file containing the sentence matrix.
            filenameSearchTree: File name handed to ``SearchTree`` for loading.
            filenameMessageMap: Pickle file containing the
                filtered-to-unfiltered message map.
        """
        # Resolve defaults lazily, in the same order as the parameters
        folder = self.folder if folder is None else folder
        filenameW2V = self.filenameW2V if filenameW2V is None else filenameW2V
        filenameMatrix = (self.filenameMatrix
                          if filenameMatrix is None else filenameMatrix)
        filenameSearchTree = (self.filenameSearchTree
                              if filenameSearchTree is None
                              else filenameSearchTree)
        filenameMessageMap = (self.filenameMessageMap
                              if filenameMessageMap is None
                              else filenameMessageMap)

        # Word2Vec model, read back from disk
        self.W2V = Word2Vec(filenameW2V, folder=folder, load_from_file=True)
        print('loaded W2V')

        # Sentence vectorizer built on top of the loaded model
        self.vectorizer = SentenceVectorizer(self.W2V)
        print('loaded vectorizer')

        # Search tree, read back from disk (no matrix supplied)
        self.treeExtended = SearchTree(sentences_matrix=None,
                                       filename=filenameSearchTree,
                                       folder=folder)
        print('loaded tree')

        # Sentence matrix and the matrices derived from it
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        matrix_path = os.path.join(folder, filenameMatrix)
        with open(matrix_path, 'rb') as matrix_file:
            self.sentenceMatrix = pickle.load(matrix_file)
        cleaned = remove_NaNs_from(self.sentenceMatrix)
        self.sentenceMatrixNoNans, self.correspondenceVector = cleaned

        # Scale every row by the largest row-wise Euclidean norm
        row_norms = sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1)
        self.MaximumVectorLength = np.max(row_norms)
        self.sentenceMatrixNormalized = (self.sentenceMatrixNoNans
                                         / self.MaximumVectorLength)

        # Append one extra column: the sentence level scaled by levelFactor
        level_column = (self.sentenceLevelNoNans[self.correspondenceVector]
                        * self.levelFactor)
        self.sentenceMatrixExtended = np.hstack(
            (self.sentenceMatrixNormalized, level_column))
        print('loaded matrix')

        # Filtered-to-unfiltered message map
        map_path = os.path.join(folder, filenameMessageMap)
        with open(map_path, 'rb') as map_file:
            self.FilteredToUnfilteredMessageMap = pickle.load(map_file)
        print('loaded message map')
示例#3
0
    def _initChatBot(self,
                     folder,
                     filenameClient,
                     filenameAgent,
                     filenameW2V,
                     filenameSearchTree,
                     filenameMessageMap,
                     load_from_file,
                     retrain_on_agent_file=False,
                     filenameMessageDF='MessagesDF.pkl'):
        """Load or build every component the bot needs to answer messages.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file; used to train the Word2Vec
                model (from-scratch mode) and to compute the sentence matrix.
            filenameAgent: Agent messages file, used only for the optional
                retraining step.
            filenameW2V: Saved Word2Vec model file (load mode).
            filenameSearchTree: Saved search tree file (load mode).
            filenameMessageMap: Pickle file with the filtered-to-unfiltered
                message map; it is read in both modes.
            load_from_file: If true, load the model and tree from disk;
                otherwise train the model and build the tree.
            retrain_on_agent_file: If true (from-scratch mode only), retrain
                the model on ``filenameAgent``.
            filenameMessageDF: Messages data frame file name. Defaults to the
                previously hard-coded ``'MessagesDF.pkl'``.
        """
        # Load messages dataframe
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Load the filtered-to-unfiltered message map
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        fn = os.path.join(folder, filenameMessageMap)
        with open(fn, 'rb') as fid:
            self.FilteredToUnfilteredMessageMap = pickle.load(fid)

        # Either load the Word2Vec model from file ...
        if load_from_file:
            self.W2V = Word2Vec(filenameW2V,
                                folder=folder,
                                load_from_file=True)
        # ... or train it from scratch
        else:
            self.W2V = Word2Vec(filenameClient, folder=folder, workers=7)

            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder=folder)

        # The vectorizer and sentence matrix must exist BEFORE the tree is
        # built: the from-scratch tree is constructed from self.matrix.
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.matrix = self.vectorizer.get_sentences_matrix(filenameClient,
                                                           folder=folder)

        if load_from_file:
            self.tree = SearchTree(sentences_matrix=None,
                                   filename=filenameSearchTree,
                                   folder=folder)
        else:
            self.tree = SearchTree(self.matrix)
示例#4
0
Created on Fri Aug 19 16:52:19 2016

@author: piromast
"""
# In[1]:
# Cell 1: train a Word2Vec model on the client messages file, vectorize
# every sentence in that file and build a search tree over the vectors.
from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree
import message_db.message_db as mdb

# Data directory and the filtered message files located inside it
folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data'
filenameClient = '04_clientMessagesFiltered.txt'
filenameAgent = '04_agentMessagesFiltered.txt'
W2V = Word2Vec(filenameClient, folder=folder, workers=7)

# Optional retraining on the agent messages (currently disabled)
#W2V.retrain(filenameAgent, folder = folder)

vectorizer = SentenceVectorizer(W2V)
matrix = vectorizer.get_sentences_matrix(filenameClient, folder=folder)
tree = SearchTree(matrix)

# In[2]:
# Cell 2: read the summary table and pull out the line-position columns
# for the filtered and the original message files.
import pandas as pd
import numpy as np
import os

filenameIndexTable = os.path.join(folder, 'client_agent_summary.csv')

indexTable = pd.read_csv(filenameIndexTable)

# Raw NumPy arrays of the two position columns
linePositionInFilteredFile = indexTable['linePosFiltered'].values
linePosition = indexTable['linePos'].values
# Construct map from filtered to unfiltered file line number
示例#5
0
    def _initChatBot(self, folder, filenameClient, filenameAgent, filenameMessageDF,
                     filenameMessageInfoTable, load_from_file, retrain_on_agent_file = True):
        """Load or build every component the bot needs to answer messages.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file; used to train the Word2Vec
                model and to compute the sentence matrix (from-scratch mode).
            filenameAgent: Agent messages file, used only for the optional
                retraining step.
            filenameMessageDF: Messages data frame file name.
            filenameMessageInfoTable: CSV file providing the ``convPos``,
                ``convID``, ``linePosFiltered`` and ``linePos`` columns.
            load_from_file: If true, delegate to ``self.load()``; otherwise
                train and build everything from scratch.
            retrain_on_agent_file: If true (from-scratch mode only), retrain
                the model on ``filenameAgent``.
        """
        # Load messages dataframe
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Import chat info table
        filenameIndexTable = os.path.join(folder, filenameMessageInfoTable)
        indexTable = pd.read_csv(filenameIndexTable)

        # Extract sentence level information and conversation number
        # (the order of each sentence in a conversation), as column vectors
        self.sentenceLevel = indexTable['convPos'].values.reshape((-1, 1))
        self.conversationNumber = indexTable['convID'].values.reshape((-1, 1))
        linePositionInFilteredFile = indexTable['linePosFiltered'].values
        linePosition = indexTable['linePos'].values

        # Keep only the levels of rows that have a filtered-file position
        self.sentenceLevelNoNans = self.sentenceLevel[~np.isnan(linePositionInFilteredFile)]

        # Either load everything from files:
        if load_from_file:
            self.load()

        # or generate it from scratch:
        else:
            # Train W2V model on client file
            self.W2V = Word2Vec(filenameClient, folder = folder, workers = 7)

            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder = folder, workers = 7)

            # Make the sentence vectorizer object and get the sentence matrix
            self.vectorizer = SentenceVectorizer(self.W2V)
            self.sentenceMatrix = self.vectorizer.get_sentences_matrix(filenameClient,
                                                                       folder = folder)

            # Construct map from filtered to unfiltered file line number.
            # Rows are walked pairwise; the previous code used the VALUES of
            # linePosition as row indices, which is only correct when that
            # column happens to equal the row index. The builtin int replaces
            # np.int, which newer NumPy releases no longer provide.
            filteredToUnfiltered = [
                int(unfilteredPos)
                for unfilteredPos, filteredPos in zip(linePosition,
                                                      linePositionInFilteredFile)
                if not np.isnan(filteredPos)
            ]

            # Normalize sentenceMatrix by the largest row-wise Euclidean norm
            self.sentenceMatrixNoNans, self.correspondenceVector = remove_NaNs_from(self.sentenceMatrix)
            self.MaximumVectorLength = np.max(sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1))
            self.sentenceMatrixNormalized = self.sentenceMatrixNoNans / self.MaximumVectorLength

            # Extend sentence matrix with one dimension equal to level
            self.sentenceMatrixExtended = np.hstack(
                (self.sentenceMatrixNormalized,
                 self.sentenceLevelNoNans[self.correspondenceVector] * self.levelFactor))

            # Calculate compound filtered to unfiltered message map: one entry
            # per row that survived NaN removal
            self.FilteredToUnfilteredMessageMap = [filteredToUnfiltered[i]
                                                   for i in self.correspondenceVector]

            # Search tree over the level-extended vectors
            self.treeExtended = SearchTree(self.sentenceMatrixExtended)

        # Instantiate input cleaner object
        self.cleaner = InputCleaner(folder)
示例#6
0
class ChattyV2(object):
    """Retrieval chat bot that also matches on the position in a conversation.

    Each incoming message is cleaned, vectorized, scaled, and extended with
    one extra component (the current conversation level times
    ``levelFactor``). The nearest stored vector is looked up in a search
    tree and the corresponding agent and client messages are returned.
    """

    def __init__(self, folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data',
                 filenameClient = '05_clientMessagesFilteredfastText.txt',
                 filenameAgent = '05_agentMessagesFilteredfastText.txt',
                 filenameMessageDF = 'MessagesDF.pkl',
                 filenameW2V = 'W2V_Chatty2_0.mdl',
                 filenameSearchTree = 'SearchTree_Chatty2_0.pkl',
                 filenameMessageMap = 'Filtered2UnfilteredMessageMap_Chatty2_0.pkl',
                 filenameMessageInfoTable = 'client_agent_summary3.csv',
                 filenameMatrix = 'SentenceMatrix_Chatty2_0.pkl',
                 load_from_file = True, levelFactor = 0.35):
        """Store the configuration and initialise the bot.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file.
            filenameAgent: Agent messages file.
            filenameMessageDF: Messages data frame file.
            filenameW2V: Saved Word2Vec model file.
            filenameSearchTree: Saved search tree file.
            filenameMessageMap: Saved filtered-to-unfiltered message map.
            filenameMessageInfoTable: CSV table with per-message information.
            filenameMatrix: Saved sentence matrix file.
            load_from_file: Load saved components instead of rebuilding them.
            levelFactor: Multiplier applied to the conversation level before
                it is appended to a sentence vector.
        """
        # Init properties
        self.levelFactor = levelFactor
        self.conversation_level = 0
        self.name = 'Chatty 2.0'
        self.folder = folder
        self.filenameClient = filenameClient
        self.filenameAgent = filenameAgent
        self.filenameMessageDF = filenameMessageDF
        self.filenameW2V = filenameW2V
        self.filenameSearchTree = filenameSearchTree
        self.filenameMessageMap = filenameMessageMap
        self.filenameMessageInfoTable = filenameMessageInfoTable
        self.filenameMatrix = filenameMatrix

        # Init bot
        self._initChatBot(folder, filenameClient, filenameAgent,
                          filenameMessageDF, filenameMessageInfoTable, load_from_file)

    def _initChatBot(self, folder, filenameClient, filenameAgent, filenameMessageDF,
                     filenameMessageInfoTable, load_from_file, retrain_on_agent_file = True):
        """Load or build every component the bot needs to answer messages.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file; used to train the Word2Vec
                model and to compute the sentence matrix (from-scratch mode).
            filenameAgent: Agent messages file, used only for the optional
                retraining step.
            filenameMessageDF: Messages data frame file name.
            filenameMessageInfoTable: CSV file providing the ``convPos``,
                ``convID``, ``linePosFiltered`` and ``linePos`` columns.
            load_from_file: If true, delegate to ``self.load()``; otherwise
                train and build everything from scratch.
            retrain_on_agent_file: If true (from-scratch mode only), retrain
                the model on ``filenameAgent``.
        """
        # Load messages dataframe
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Import chat info table
        filenameIndexTable = os.path.join(folder, filenameMessageInfoTable)
        indexTable = pd.read_csv(filenameIndexTable)

        # Extract sentence level information and conversation number
        # (the order of each sentence in a conversation), as column vectors
        self.sentenceLevel = indexTable['convPos'].values.reshape((-1, 1))
        self.conversationNumber = indexTable['convID'].values.reshape((-1, 1))
        linePositionInFilteredFile = indexTable['linePosFiltered'].values
        linePosition = indexTable['linePos'].values

        # Keep only the levels of rows that have a filtered-file position
        self.sentenceLevelNoNans = self.sentenceLevel[~np.isnan(linePositionInFilteredFile)]

        # Either load everything from files:
        if load_from_file:
            self.load()

        # or generate it from scratch:
        else:
            # Train W2V model on client file
            self.W2V = Word2Vec(filenameClient, folder = folder, workers = 7)

            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder = folder, workers = 7)

            # Make the sentence vectorizer object and get the sentence matrix
            self.vectorizer = SentenceVectorizer(self.W2V)
            self.sentenceMatrix = self.vectorizer.get_sentences_matrix(filenameClient,
                                                                       folder = folder)

            # Construct map from filtered to unfiltered file line number
            filteredToUnfiltered = self._buildMessageMap(linePosition,
                                                         linePositionInFilteredFile)

            # Remove NaN rows, normalize and append the level column
            self._prepareMatrices()

            # Calculate compound filtered to unfiltered message map: one entry
            # per row that survived NaN removal
            self.FilteredToUnfilteredMessageMap = [filteredToUnfiltered[i]
                                                   for i in self.correspondenceVector]

            # Search tree over the level-extended vectors
            self.treeExtended = SearchTree(self.sentenceMatrixExtended)

        # Instantiate input cleaner object
        self.cleaner = InputCleaner(folder)

    @staticmethod
    def _buildMessageMap(linePosition, linePositionInFilteredFile):
        """Return the unfiltered line numbers of rows kept by the filter.

        Rows are walked pairwise; a row is kept when its filtered position is
        not NaN. The previous inline code used the VALUES of ``linePosition``
        as row indices, which is only correct when that column equals the row
        index, and relied on ``np.int``, which newer NumPy releases no longer
        provide.
        """
        return [int(unfilteredPos)
                for unfilteredPos, filteredPos in zip(linePosition,
                                                      linePositionInFilteredFile)
                if not np.isnan(filteredPos)]

    def _prepareMatrices(self):
        """Derive the NaN-free, normalized and extended matrices from
        ``self.sentenceMatrix``."""
        self.sentenceMatrixNoNans, self.correspondenceVector = remove_NaNs_from(self.sentenceMatrix)
        # Scale every row by the largest row-wise Euclidean norm
        self.MaximumVectorLength = np.max(sp.linalg.norm(self.sentenceMatrixNoNans, ord=2, axis=1))
        self.sentenceMatrixNormalized = self.sentenceMatrixNoNans / self.MaximumVectorLength
        self._extendSentenceMatrix()

    def _extendSentenceMatrix(self):
        """Append the level column (level times ``levelFactor``) to the
        normalized sentence matrix."""
        self.sentenceMatrixExtended = np.hstack(
            (self.sentenceMatrixNormalized,
             self.sentenceLevelNoNans[self.correspondenceVector] * self.levelFactor))

    def getReplyFromChatBot(self, input_message):
        """Return ``(AgentMessage, ClientMessage)`` for the closest stored message.

        The conversation level is incremented once the message has been
        vectorized. Any exception is caught and printed (best-effort
        behaviour kept from the original), in which case ``None`` is returned.
        """
        try:
            # Clean message
            input_message = self.cleaner.clean_input(input_message)

            # Vectorize message
            vect = self._getSentenceVector(input_message, self.conversation_level)
            self.conversation_level += 1

            # Find nearest message in client corpus
            ind, _ = self.treeExtended.findNearestVector(vect)
            print(ind)

            # Unmap to account for filter-removed messages
            ind = self.FilteredToUnfilteredMessageMap[ind]

            # Get closest client and agent message
            df = self.MessagesDataFrame
            ClientMessage = mdb.get_client_message_from_dataframe(df, ind)
            AgentMessage = mdb.get_agent_message_from_dataframe(df, ind)
            return (AgentMessage, ClientMessage)

        except Exception as inst:
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args
            print(inst)          # __str__ allows args to be printed directly
            return None

    def resetChat(self):
        """Restart the conversation by resetting the level counter to zero."""
        self.conversation_level = 0

    def save(self, folder = None, filenameW2V = None, filenameMatrix = None,
             filenameSearchTree = None, filenameMessageMap = None):
        """Persist the model, search tree, message map and sentence matrix.

        Any argument left as ``None`` falls back to the instance attribute of
        the same name.
        """
        if folder is None:
            folder = self.folder
        if filenameW2V is None:
            filenameW2V = self.filenameW2V
        if filenameMatrix is None:
            filenameMatrix = self.filenameMatrix
        if filenameSearchTree is None:
            filenameSearchTree = self.filenameSearchTree
        if filenameMessageMap is None:
            filenameMessageMap = self.filenameMessageMap

        self.W2V.save(filenameW2V, folder)
        print('saved W2V')
        self.treeExtended.save_tree(filenameSearchTree, folder)
        print('saved tree')

        with open(os.path.join(folder, filenameMessageMap), 'wb') as f:
            pickle.dump(self.FilteredToUnfilteredMessageMap, f, protocol=2)
        print('saved message map')

        with open(os.path.join(folder, filenameMatrix), 'wb') as f:
            pickle.dump(self.sentenceMatrix, f, protocol=2)
        print('saved matrix')

    def load(self, folder = None, filenameW2V = None, filenameMatrix = None,
             filenameSearchTree = None, filenameMessageMap = None):
        """Restore the bot's components from previously saved files.

        Any argument left as ``None`` falls back to the instance attribute of
        the same name. ``self.sentenceLevelNoNans`` and ``self.levelFactor``
        must already be set, because the extended matrix is rebuilt here.
        """
        if folder is None:
            folder = self.folder
        if filenameW2V is None:
            filenameW2V = self.filenameW2V
        if filenameMatrix is None:
            filenameMatrix = self.filenameMatrix
        if filenameSearchTree is None:
            filenameSearchTree = self.filenameSearchTree
        if filenameMessageMap is None:
            filenameMessageMap = self.filenameMessageMap

        # Load w2v object
        self.W2V = Word2Vec(filenameW2V, folder = folder,
                            load_from_file = True)
        print('loaded W2V')

        # Load vectorizer
        self.vectorizer = SentenceVectorizer(self.W2V)
        print('loaded vectorizer')

        # Load search tree
        self.treeExtended = SearchTree(sentences_matrix=None,
                                       filename=filenameSearchTree,
                                       folder=folder)
        print('loaded tree')

        # Load sentence matrix and prepare submatrices
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(os.path.join(folder, filenameMatrix), 'rb') as fid:
            self.sentenceMatrix = pickle.load(fid)
        self._prepareMatrices()
        print('loaded matrix')

        # Load message map
        with open(os.path.join(folder, filenameMessageMap), 'rb') as fid:
            self.FilteredToUnfilteredMessageMap = pickle.load(fid)
        print('loaded message map')

    def changeLevelFactor(self, levelFactor):
        """Set a new ``levelFactor`` and rebuild the extended matrix and tree."""
        self.levelFactor = levelFactor
        # Extend sentence matrix with one dimension equal to level
        self._extendSentenceMatrix()

        self.treeExtended = SearchTree(self.sentenceMatrixExtended)
        print('Chenged level to'+str(self.levelFactor))

    def _getSentenceVector(self, text, level):
        """Vectorize ``text``, scale it by ``MaximumVectorLength`` and append
        ``level * levelFactor`` as the final component."""
        vect = self.vectorizer.get_sentence_vector(text) / self.MaximumVectorLength
        vect = np.hstack((vect, level * self.levelFactor))
        return vect
示例#7
0
    def __get_score(self, train_index, test_index, size, window, min_count, workers):
        """
        Private method doing the hard work described in test method.

        An input model is trained on the ``train_index`` lines of every file
        in ``self.inputFileNames``; an output model is trained on the
        ``test_index`` lines of ``self.outputFileName``. For each test
        sentence the nearest input vector is looked up, and the output vector
        at that index is compared with the observed output vector.

        Args:
            train_index: Line selection passed to ``SentencesGeneratorInList``
                for training.
            test_index: Line selection passed to ``SentencesGeneratorInList``
                for testing.
            size, window, min_count, workers: Word2Vec hyper-parameters.

        Returns:
            Tuple ``(total_cosine_similarity, total_input_distance,
            total_output_distance)``. All three are sums over the test
            sentences, not averages.
        """
        # Hyper-parameters are passed by keyword: positionally, the second
        # constructor slot after ``size`` is ``alpha``, so ``window`` and the
        # following values would each land in the wrong parameter.
        w2vParams = dict(size=size, window=window,
                         min_count=min_count, workers=workers)

        for i, inputFile in enumerate(self.inputFileNames):
            print('Training iteration:',str(i))
            sentenceGeneratorTrainSetInput = SentencesGeneratorInList(inputFile, train_index)
            if i == 0:  # First train is done creating instance of Word2Vec
                W2VInput = gensim.models.Word2Vec(sentenceGeneratorTrainSetInput,
                                                  **w2vParams)
                W2VInput = Word2Vec(W2VInput)
            else:       # Subsequent training is done calling train method
                # NOTE(review): train() does not take model hyper-parameters;
                # these positional values end up in its bookkeeping arguments.
                # Left as-is because the correct call differs between gensim
                # versions -- confirm against the installed version.
                W2VInput.model.train(sentenceGeneratorTrainSetInput, size, window, min_count, workers)

        # Generate input and output sentence generators
        sentenceGeneratorTestSetInput = SentencesGeneratorInList(self.inputFileNames[0], test_index)
        sentenceGeneratorTestSetOutput = SentencesGeneratorInList(self.outputFileName, test_index)

        # Train the output word2vec model and build both vectorizers
        W2VOutput = gensim.models.Word2Vec(sentenceGeneratorTestSetOutput,
                                           **w2vParams)
        W2VOutput = Word2Vec(W2VOutput)
        vectorizerInput = SentenceVectorizer(W2VInput)
        vectorizerOutput = SentenceVectorizer(W2VOutput)

        # Calculate matrices of sentence vectors for the test sentences
        matrixInput = vectorizerInput.get_sentences_matrix(
            sentenceGenerator = sentenceGeneratorTestSetInput)
        matrixInput, _ = remove_NaNs_from(matrixInput)

        matrixOutput = vectorizerOutput.get_sentences_matrix(
            sentenceGenerator = sentenceGeneratorTestSetOutput)
        # NOTE(review): NaN rows are removed from the two matrices
        # independently, so row i of one may no longer correspond to row i of
        # the other -- confirm that alignment is preserved.
        matrixOutput, _ = remove_NaNs_from(matrixOutput)

        # Generate search tree (to find nearest vector)
        treeInput = SearchTree(matrixInput)
        #treeOutput = SearchTree(matrixOutput)

        # Initialize different distance measures
        # (input and output Euclidean distance and cosine measure)
        total_input_distance = 0
        total_output_distance = 0
        total_cosine_similarity = 0

        for vIn, vOut in zip(matrixInput, matrixOutput):
            # Find nearest vector to input in sentence corpus
            indexInput, distanceInput = treeInput.findNearestVector(vIn)
            # Accumulate input vector distance to nearest neighbour
            total_input_distance += distanceInput

            # Output vector corresponding to the nearest input sentence
            vOutPredicted = matrixOutput[indexInput,]

            # Euclidean distance between observed and predicted output
            distanceOutput = spd.euclidean(vOut, vOutPredicted)
            total_output_distance += distanceOutput

            # NOTE(review): if spd is scipy.spatial.distance, cosine() is the
            # cosine DISTANCE (1 - similarity), despite the variable name.
            cosineSimilarity = spd.cosine(vOut, vOutPredicted)
            total_cosine_similarity += cosineSimilarity

        return (total_cosine_similarity,
                total_input_distance, total_output_distance)
示例#8
0
@author: piromast
"""
# In[1]:
# Cell 1: train a Word2Vec model on the client messages file, vectorize
# every sentence in that file and build a search tree over the vectors.
from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree, remove_NaNs_from
import message_db.message_db as mdb
import os
from preprocessing.filtering import filter_line, build_names, NLTKPreprocessor

# Data directory and the filtered message files located inside it
folder = 'C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data'
# Alternative agent file (currently disabled)
#filenameAgent = '04_agentMessagesFilteredTrigramLemmatized.txt'
filenameClient = '04_clientMessagesFilteredOld.txt'
filenameAgent = '04_agentMessagesFilteredOld.txt'

W2V = Word2Vec(filenameClient, folder = folder)

vectorizer = SentenceVectorizer(W2V)
sentenceMatrix = vectorizer.get_sentences_matrix(filenameClient, folder = folder)

tree = SearchTree(sentenceMatrix)

# In[2]:
# Cell 2: read the summary table and pull out per-message columns.
import pandas as pd
import numpy as np
import scipy as sp

filenameIndexTable = os.path.join(folder,'client_agent_summaryOld.csv')

indexTable = pd.read_csv(filenameIndexTable)
# convPos and convID are reshaped into single-column 2-D arrays
sentenceLevel = indexTable['convPos'].values.reshape((-1,1))
conversationNumber = indexTable['convID'].values.reshape((-1,1))
# linePosFiltered is kept as a flat array
linePositionInFilteredFile = indexTable['linePosFiltered'].values
示例#9
0
class ChattyV1(object):
    """Retrieval chat bot based on nearest-neighbour search of sentence vectors.

    An incoming message is vectorized, the nearest stored client sentence
    vector is looked up in a search tree, and the corresponding agent and
    client messages are returned.
    """

    def __init__(
            self,
            folder='C:\\Users\\Client\\Dropbox\\S2DS - M&S\\Data',
            filenameClient='04_clientMessagesFiltered.txt',
            filenameAgent='04_agentMessagesFiltered.txt',
            filenameMessageDF='MessagesDF.pkl',
            filenameW2V='W2V_Chatty1_0.mdl',
            filenameSearchTree='SearchTree_Chatty1_0.pkl',
            filenameMessageMap='Filtered2UnfilteredMessageMap_Chatty1_0.pkl',
            load_from_file=True):
        """Initialise the bot.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file.
            filenameAgent: Agent messages file.
            filenameMessageDF: Messages data frame file.
            filenameW2V: Saved Word2Vec model file.
            filenameSearchTree: Saved search tree file.
            filenameMessageMap: Saved filtered-to-unfiltered message map.
            load_from_file: Load the saved model and tree instead of
                rebuilding them.
        """
        # Init bot (filenameMessageDF is now forwarded instead of ignored)
        self._initChatBot(folder, filenameClient, filenameAgent, filenameW2V,
                          filenameSearchTree, filenameMessageMap,
                          load_from_file,
                          filenameMessageDF=filenameMessageDF)

    def _initChatBot(self,
                     folder,
                     filenameClient,
                     filenameAgent,
                     filenameW2V,
                     filenameSearchTree,
                     filenameMessageMap,
                     load_from_file,
                     retrain_on_agent_file=False,
                     filenameMessageDF='MessagesDF.pkl'):
        """Load or build every component the bot needs to answer messages.

        The message map is read from ``filenameMessageMap`` in both modes,
        and the sentence matrix is always recomputed from ``filenameClient``.

        Args:
            folder: Directory holding all data and model files.
            filenameClient: Client messages file.
            filenameAgent: Agent messages file, used only for the optional
                retraining step.
            filenameW2V: Saved Word2Vec model file (load mode).
            filenameSearchTree: Saved search tree file (load mode).
            filenameMessageMap: Pickle file with the filtered-to-unfiltered
                message map.
            load_from_file: If true, load the model and tree from disk;
                otherwise train the model and build the tree.
            retrain_on_agent_file: If true (from-scratch mode only), retrain
                the model on ``filenameAgent``.
            filenameMessageDF: Messages data frame file name. Defaults to the
                previously hard-coded ``'MessagesDF.pkl'``.
        """
        # Load messages dataframe
        self.MessagesDataFrame = mdb.load_data_frame(filenameMessageDF, folder)

        # Load the filtered-to-unfiltered message map
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        fn = os.path.join(folder, filenameMessageMap)
        with open(fn, 'rb') as fid:
            self.FilteredToUnfilteredMessageMap = pickle.load(fid)

        # Either load the Word2Vec model from file ...
        if load_from_file:
            self.W2V = Word2Vec(filenameW2V,
                                folder=folder,
                                load_from_file=True)
        # ... or train it from scratch
        else:
            self.W2V = Word2Vec(filenameClient, folder=folder, workers=7)

            # Retrain (optionally) on agent file
            if retrain_on_agent_file:
                self.W2V.retrain(filenameAgent, folder=folder)

        # The vectorizer and sentence matrix must exist BEFORE the tree is
        # built: the from-scratch tree is constructed from self.matrix.
        self.vectorizer = SentenceVectorizer(self.W2V)
        self.matrix = self.vectorizer.get_sentences_matrix(filenameClient,
                                                           folder=folder)

        if load_from_file:
            self.tree = SearchTree(sentences_matrix=None,
                                   filename=filenameSearchTree,
                                   folder=folder)
        else:
            self.tree = SearchTree(self.matrix)

    def getReplyFromChatBot(self, input_message):
        """Return ``(AgentMessage, ClientMessage)`` for the closest stored message.

        Any exception is caught and printed (best-effort behaviour kept from
        the original), in which case ``None`` is returned.
        """
        try:
            # Vectorize message
            vect = self.vectorizer.get_sentence_vector(input_message)
            print(vect)
            # Find nearest message in client corpus
            ind, _ = self.tree.findNearestVector(vect)
            print(ind)
            # Unmap to account for filter-removed messages
            ind = self.FilteredToUnfilteredMessageMap[ind]

            # Get closest client and agent message
            df = self.MessagesDataFrame
            ClientMessage = mdb.get_client_message_from_dataframe(df, ind)
            AgentMessage = mdb.get_agent_message_from_dataframe(df, ind)
            return (AgentMessage, ClientMessage)
        except Exception as inst:
            # print() calls with one argument behave the same on Python 2
            # and Python 3; the old print statements were Python 2 only.
            print(type(inst))  # the exception instance
            print(inst.args)  # arguments stored in .args
            print(inst)  # __str__ allows args to be printed directly
            return None
示例#10
0
# Minimal end-to-end example: train a model, vectorize a corpus, build a
# search tree and look up the nearest stored sentence for a query.
# NOTE: the app folder must be on PYTHONPATH for the import below to resolve;
# no code here adds it.

from vectorisations.vectorization import Word2Vec, SentenceVectorizer, SearchTree

import os.path

# Path to the filtered client messages file
filename = os.path.join("/Users", "piromast", "Dropbox", "S2DS - M&S", "Data",
                        "04_clientMessagesFiltered.txt")
W2V = Word2Vec(filename)  # Train word2vec on the file in filename
vectorizer = SentenceVectorizer(W2V)  # Make sentence vectorizer object
matrix = vectorizer.get_sentences_matrix(
    filename)  # Calculate matrix of sentence vectors of the whole text file
tree = SearchTree(matrix)  # Generate search tree (to find nearest vector)

# Lookup example
v = vectorizer.get_sentence_vector(
    "I lost my order")  # Convert sentence to vector
# Index of, and distance to, the nearest stored sentence vector
index, distance = tree.findNearestVector(v)