Example #1
    def init(self, name, removeStopWords=True, faq_gsheet=None, hash_featurez=False):  
        super().init(name) 
        src = 'RSNNModel.init'

        # two dicts keyed by class category: class_cat -> response, and class_cat -> classify question
        self.faq_responses_db, self.faq_classify_phrases_db = dataSource.doGSheet_FAQ(faq_gsheet, dataSource.zGSHEET_FAQ)
        # self.num_classes = len( self.faq_responses_db.keys() ) 

        self.class_categoriez = {} # force labelz into numerics TODO: refine reuse @priors
        for i, v in enumerate( self.faq_responses_db.keys() ) :
            self.class_categoriez[v] = i
        self.class_categoriez["I don't know yet. Will learn more"] = len(self.class_categoriez) 
        
        self.num_classes = len( self.class_categoriez )

        zlogger.log(src, self.showParams() )
        
        ## feature mappers
        feature_col_mapper = [
            tf.feature_column.embedding_column(
                categorical_column = tf.feature_column.categorical_column_with_hash_bucket(key='user_que', hash_bucket_size=100) if hash_featurez
                    else tf.feature_column.categorical_column_with_identity('user_que', num_buckets=100),  # num_buckets is a required arg; 100 mirrors the hash bucket size (assumption)
                dimension = int(self.num_classes ** 0.25)  # rule-of-thumb embedding size: 4th root of the category count
            )
        ]


        self.model = tf.estimator.DNNClassifier(
            feature_columns = feature_col_mapper,
            hidden_units = [256, 128], # hyper params?? TODO: ref and param
            model_dir = self.getModelFPath(),
            n_classes = self.num_classes,
            # label_vocabulary = list( self.class_categoriez.keys() ) # if defined here then labelz can be string in map_input_fn below 
            
        )
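
Both training (Example #20) and prediction (Example #11) feed this estimator through input_fn callables. A minimal sketch of the training-side input_fn, assuming faq_classify_phrases_db maps each class category to one classify phrase (that shape is an assumption, not confirmed by the repo):

    def map_input_train_data(self):
        # sketch only: build (features, labels) from the FAQ phrase db,
        # assuming one classify phrase per class category
        import numpy as np
        phrases = list(self.faq_classify_phrases_db.values())
        labels = [self.class_categoriez[c] for c in self.faq_classify_phrases_db]
        return {'user_que': np.array(phrases)}, np.array(labels)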
Example #2
def writeTo(content, dpath, dtype=zFILE, mode=MODE_WRITE):
    res = STREAMZ.get(dtype, doFile)
    zlogger.log("dataSource.writeTo", "dpath = {}".format(dpath))
    res(
        dpath,
        mode=mode,
        content=content,
    )
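
Hypothetical usage, relying on names defined in this module (the zSERIALIZED call is illustrative of selecting another registered stream type, not a confirmed handler contract):

# hypothetical usage: the default dtype falls back to the doFile handler
writeTo('The quick brown fox jumped over the lazy dogs.', 'example.txt')
# another registered stream type can be selected via dtype
writeTo('payload', 'example.byt', dtype=zSERIALIZED)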
Example #3
    def dump(self, fpath=None):
        fpath = self.model_fpath if fpath is None else fpath
        try:
            with open(fpath, "wb") as fd:
                pickle.dump(self.model, fd)
            zlogger.log("{}.model.dump".format(self.__class__.__name__),
                        "Model saved to file successfully")
        except Exception:
            zlogger.logError("{}.model.dump".format(self.__class__.__name__),
                             "Failed to pickle model to file - {}".format(fpath))
    def predict(self, input_text):
        clean_encoded_text = self.preprocessText(input_text)
        zlogger.log('mlp.predict', "IN: {}".format(repr(clean_encoded_text)))

        idx = self.model.predict(clean_encoded_text)

        zlogger.log('mlp.predict', "ANS: {}".format(idx))

        return idx
Example #5
    def dumpSave(self, fpath=None):
        fpath = self.getModelFPath(fpath)

        try:
            with open(fpath, "wb") as fd:
                pickle.dump(self.persist, fd)
            zlogger.log("{}.model.dump".format(self.__class__.__name__),
                        "Model saved to file successfully")
        except Exception:
            zlogger.logError("{}.model.dump".format(self.__class__.__name__),
                             "Failed to pickle model to file - {}".format(fpath))
Example #6
    def load(self, fpath=None):
        # 1. model definition
        super().load(fpath)
        # 2. training data
        fpath = "{}.dat".format(self.model_fpath if fpath is None else fpath)
        try:
            with open(fpath, "rb") as fd:
                self.dataset = pickle.load(fd)
            zlogger.log("{}.dataset.load".format(self.__class__.__name__),
                        "Dataset loaded from file successfully")
        except Exception:
            zlogger.logError("{}.dataset.load".format(self.__class__.__name__),
                             "Failed to unpickle dataset from file - {}".format(fpath))
Example #7
    def load(self, fpath=None):
        self.name = getClassName() if fpath is None else re.search(
            r'(.*)\.zmd', fpath)[1]  # raw string avoids the invalid-escape warning
        fpath = self.getModelFPath() if fpath is None else fpath

        try:
            with open(fpath, "rb") as fd:
                self.model = pickle.load(fd)
            zlogger.log("{}.model.load".format(self.__class__.__name__),
                        "Model loaded from file successfully")
        except Exception:
            zlogger.logError("{}.model.load".format(self.__class__.__name__),
                             "Failed to unpickle model from file - {}".format(fpath))
Example #8
    def dump(self, fpath=None):
        # 1. model definition
        super().dump(fpath)
        # 2. training data
        fpath = "{}.dat".format(self.model_fpath if fpath is None else fpath)
        try:
            with open(fpath, "wb") as fd:
                pickle.dump(self.dataset, fd)
            zlogger.log("{}.dataset.dump".format(self.__class__.__name__),
                        "Dataset dumped to file successfully")
        except Exception:
            zlogger.logError("{}.dataset.dump".format(self.__class__.__name__),
                             "Failed to pickle dataset to file - {}".format(fpath))
    def predict(self, input_text):
        zlogger.log('cosine.predict', "IN: {}".format(repr(input_text)))

        zlogger.log('cosine.predict',
                    "IN.PREPROC: {}".format(repr(self.preprocessor)))

        clean_input_text = self.preprocessText(input_text)

        zlogger.log('cosine.predict',
                    "IN.CLEAN: {}".format(repr(clean_input_text)))

        # if not isinstance(clean_input_text, str):
        #     clean_input_text = " ".join( list( clean_input_text) )

        input_vec = self.model.transform(clean_input_text)
        valz = cosine_similarity(input_vec, self.trained_matrix)
        idx = valz.argsort()[0][-1]

        # zlogger.log('cosine.predict', "ANS: {}".format( idx ) )

        # flatz = valz.flatten()
        # flatz.sort()
        # resp = flatz[-1]
        resp = valz[0][idx]
        if resp <= self._predict_threshold:  ## TODO threshold it at .5
            idx = None

        zlogger.log('CosSimilarity.Predict',
                    "idx = {}, resp= {}".format(idx, resp))
        return idx
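
For context, a minimal sketch of the training side this predict assumes: a fitted TF-IDF vectorizer in self.model and the matrix of training-sentence vectors in self.trained_matrix. The attribute names come from the predict above; the body itself is an assumption, not the repo's actual train:

    def train(self, train_x, train_y=None, test_x=None, test_y=None):
        # sketch only: fit the vectorizer and cache the training matrix
        from sklearn.feature_extraction.text import TfidfVectorizer
        self.model = TfidfVectorizer()
        self.trained_matrix = self.model.fit_transform(train_x)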
Example #10
def splitTrainTest(clean_data, test_prop=0.2):
    the_data = np.array(clean_data)

    zlogger.log('splitTrainTest', "Provided data size = {}\n{}".format(len(the_data), the_data[0]))

    n_recs = len(the_data)
    n_test = math.trunc(test_prop * n_recs)  # was hard-coded to 0.2; honor the test_prop argument
    # shuffle in place, then split off the last n_test records as the test set
    np.random.shuffle(the_data)
    train_data, test_data = the_data[:(n_recs - n_test)], the_data[(n_recs - n_test):]
    # TODO: should we flatten and when

    return list(train_data), list(test_data)
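
A quick sanity check of the split (sizes only; ordering is randomized):

train, test = splitTrainTest(list(range(100)), test_prop=0.2)
assert len(train) == 80 and len(test) == 20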
Example #11
    def predict(self, observation):
        src = 'RSNNModel.predict'

        def map_input_predict_data():
            return {'user_que': np.array([observation])}

        def fetchResponse(pred_idx):
            # class_categoriez maps label -> numeric id, so invert it to recover the label
            idx_to_label = {v: k for k, v in self.class_categoriez.items()}
            pclass = idx_to_label.get(pred_idx)
            return self.faq_responses_db.get(pclass, "I don't seem to know about that yet. I'll find out more")

        # DNNClassifier.predict yields one result dict per input example
        pred = next(self.model.predict(input_fn=map_input_predict_data))
        pred_idx = int(pred['class_ids'][0])
        zlogger.log(src, "predicted value = {} from {}".format(pred_idx, pred))

        return fetchResponse(pred_idx)
    def loadDump(self, fpath=None):
        ## 1. load other objects
        ZModel.loadDump(self, fpath)

        ## 2. load keras model
        krs_path = self.persist['model']
        zlogger.log('NgramMLP.loadDump', "Loading From: {}".format(krs_path))

        self.model = keras.models.load_model(krs_path)

        # ## load keras model as json
        # with open( krs_path, 'r') as fd:
        #     self.model = keras.models.model_from_json( fd.read() )
        # # and the weights
        # self.model.load_weights( "{}.h5".format(krs_path) )

        zlogger.log('NgramMLP.loadDump',
                    "FIN: {}".format(self.model.summary()))
Example #13
    def loadDump(self, fpath=None):
        fpath = self.getModelFPath(fpath)

        def unpackPersist():
            # promote each persisted key/value pair to an instance attribute
            if self.persist is not None:
                for k, v in self.persist.items():
                    setattr(self, k, v)

        try:
            with open(fpath, "rb") as fd:
                self.persist = pickle.load(fd)
            zlogger.log("{}.model.load".format(self.__class__.__name__),
                        "Model loaded from file successfully")
        except Exception:
            zlogger.logError("{}.model.load".format(self.__class__.__name__),
                             "Failed to unpickle model from file - {}".format(fpath))

        unpackPersist()
        zlogger.log("{}.model.load".format(self.__class__.__name__),
                    "Persist unpacked successfully")
Example #14
    def dumpLoad(self, data_path=None, data_type=None):
        self.initz()

        # resolve fallbacks before overwriting the instance attributes
        dpath = self.data_path if data_path is None else data_path
        dtype = self.data_type if data_type is None else data_type
        self.data_path = dpath

        filez = self.getDumpLoadItems()

        for ext, db in filez.items():
            tfile = "{}.{}".format(dpath, ext)  # renamed from `tf` to avoid shadowing tensorflow
            if os.path.exists(tfile):
                setattr(self, db, zdata_source.readFrom(tfile, dtype=dtype))
                zlogger.log('zdataset.dumpLoad', "Loaded {} of size {}".format(tfile, len(getattr(self, db))))
            else:
                zlogger.log('zdataset.dumpLoad', "Not Found: {}".format(tfile))

        self.data = self.clean_data
        self.updateXIndex()
        self.updateYIndex()
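
getDumpLoadItems is assumed to return a mapping of file extension to the dataset attribute it populates; a hypothetical sketch (both the extensions and the attribute names are illustrative only):

    def getDumpLoadItems(self):
        # hypothetical: "<data_path>.<ext>" is loaded into the named attribute
        return {'dat': 'clean_data', 'lbl': 'y_labelz'}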
Example #15
def runBot(isGsheetDB=False, model_type=botLogic.MODEL_TFIDF):
    src = "nCoV19.runBot"

    zlogger.log(src, "Starting")

    # 1. setup bot
    bot = BotLogicFlow()
    bot.initializeModel(model_type, "{}.zmd".format(app_name))

    # 2. run bot
    while True:
        user_input = input(colored("Talk to me: ", "yellow"))
        prompt = colored(">>>: ", "green")

        response, rcode = bot.getResponse(user_input)

        if isGsheetDB and response and rcode == BotLogicFlow.RCODE_LEARNT_RESPONSE:
            idx = gsheet_faq_training_set_db.get(response, None)  # fetch class name

            zlogger.log(src, idx)

            response = gsheet_faq_db.get(idx, "I don't know that yet. I'll find out more")

        print("{} {}\n".format(prompt, "I don't understand. Try that again" if response is None else response))

        if rcode == BotLogicFlow.RCODE_EXIT_RESPONSE:
            break

    zlogger.log(src, "Finished")
Example #16
def init_app(config_obj=conf): 
    zlogger.log( "app.init_app", f"{config_obj}" ) 
    
    zlogger.log( "app_pkg.py", f": {__name__}" )
    
    app = Flask(__name__) 
    app.config.from_object( config_obj ) 

    db.init_app( app )
    login_manager.init_app( app ) 
    bcrypt.init_app( app ) 

    from <app_pkg_name>.errors.handlers import errors 
    from <app_pkg_name>.faq.routes import faqs       

    app.register_blueprint( errors ) 
    app.register_blueprint( faqs )  
    
    ## TODO: disable if not in use
    with app.app_context():
        db.create_all()
        db.session.commit()

    return app 
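
For reference, a minimal conf object this factory can consume, assuming the usual Flask / Flask-SQLAlchemy settings (all values are placeholders):

class Config:
    SECRET_KEY = 'change-me'                       # placeholder
    SQLALCHEMY_DATABASE_URI = 'sqlite:///site.db'  # placeholder
    SQLALCHEMY_TRACK_MODIFICATIONS = False

conf = Config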
Example #17
def initializeBotEnv(src_path, src_type=dataSource.zFILE, model_type=botLogic.MODEL_TFIDF, nostopwords=True):
    global model, gsheet_faq_db, gsheet_faq_training_set_db

    src = "nCoV19_bot.initialize"

    # 1. fetch data text
    list_sentz = None
    if src_type == dataSource.zGSHEET_FAQ:
        gsheet_faq_db, gsheet_faq_training_set_db = dataSource.doGSheet_FAQ(src_path, src_type)
        list_sentz = list(gsheet_faq_training_set_db.keys())
    else:
        list_sentz = dataSource.readFrom(src_path, src_type)

    zlogger.log(src, "Loaded data text of size {}".format(len(list_sentz)))

    # 2. initialize and train model
    model = botLogic.AVAILABLE_MODELZ.get(model_type, TfidfModel)
    model = model()
    model.init(app_name, removeStopWords=nostopwords)
    model.train(list_sentz)
    zlogger.log(src, "Initialized & Trained Model {}".format(model))

    # 3. save model
    model.dump()
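
A hypothetical invocation against the Google Sheet source used elsewhere in these examples:

initializeBotEnv(
    ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'FAQ responses!A1:G1000'),
    src_type=dataSource.zGSHEET_FAQ,
)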
Example #18
    def getResponse(self, user_input_text): 
        response = None
        rcode = self.RCODE_KNOWN_RESPONSE

        key_words = cleanup_and_lemmatize( user_input_text ) 
        
        was_que = True

        for word in key_words:
            if word in self.GREETINGZ_INPUT:
                response = random.choice( self.GREETINGZ_RESPONSE) 
                was_que = False
                break
            elif word in self.THANKS_INPUT:
                response = random.choice( self.THANKS_RESPONSE )
                was_que = False
                break
            elif word in self.EXIT_INPUT:
                response = random.choice( self.THANKS_RESPONSE )+". "+random.choice( self.EXIT_RESPONSE )
                rcode = self.RCODE_EXIT_RESPONSE 
                return response, rcode 

        if was_que:
            pred_cat = self.model.predict( user_input_text )
            zlogger.log("bot.Predicted", "IN = {}".format( repr(pred_cat ) ) )
            
            if isinstance( pred_cat, list):
                pred_cat = pred_cat[0] 
            pred_cat, response = self.dset.getPredictedAtIndex( pred_cat ) 
            zlogger.log("bot.Predicted", "Class = {}".format( repr(pred_cat ) ) )

            if isinstance( response, list):
                response, response_src, response_link, *_ = response
            rcode = self.RCODE_LEARNT_RESPONSE 

        return response, rcode 
Example #19
    def predict(self, observation):
        raise NotImplementedError

    '''
    Input: 
        train_x : a list of sentences
    '''
    def train(self, train_x, train_y=None, test_x=None, test_y=None): 
        raise NotImplementedError


if __name__ == "__main__":
    src = 'rnnModel.main'
    dpath = [ ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'FAQ responses!A1:G1000'), ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'Classify_Phrases!A1:G1000')]
    
    zlogger.log(src, 'STARTING')

    rnn = RetrievalSupervisedNNModel()
    rnn.init('DNNClassifier', faq_gsheet= dpath, hash_featurez=True ) 
    zlogger.log(src, "Model is:\n{}".format( rnn ) )

    #train
    rnn.train()
    zlogger.log(src, 'Done Training. Moving on to predict using trained model')

    #predict
    sentz = [ "Is my cat sick", "Is my cat sick with the virus", "Can an insect infect me", "What is corana", "What is corana virus", "What is covid-19"]

    for s in sentz:
        r = rnn.predict( s  ) 
        zlogger.log("{}.predict".format(src), "\n{} ==> {} ".format( s, r ) )
Example #20
    def train(self, train_x=None, train_y=None, test_x=None, test_y=None, epochs=25000):
        # note: tf.estimator counts training *steps*, not dataset epochs
        self.model.train(input_fn=self.map_input_train_data, steps=epochs)
        zlogger.log('RSNNModel.train', "FINISHED: {} steps".format(epochs))
Example #21
        else:
            # return "{}\n\t{}".format(sent_tokenz[ idx ] , sent_tokenz[ idx+1])
            return "{}".format(sent_tokenz[idx])

    '''
    Input: 
        train_x : a list of sentences
    '''

    def train(self, train_x, train_y=None, test_x=None, test_y=None):
        self.dataset = train_x if isinstance(
            train_x, list) else nltk.sent_tokenize(train_x)


if __name__ == "__main__":
    zlogger.log("tfidfModel.main", "Starting")

    src = "tfidfModel.main.test"
    named = "TFIDF_ChatBot"

    st = "The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. His name was Jazzy and he had 7 bones. Hey there! Okay, bye."

    for ist in [True, False]:
        wt = "Without" if ist else "With"
        zlogger.log(src, "\n\n{0} {1} Stop Words {0}".format("-" * 7, wt))

        m = TfidfModel()
        m.init(named, removeStopWords=ist)
        m.train(st)

        zlogger.log(src, "Data: {}\nModel: {}\n".format(st, m))
Example #22
            if v in ln.lower():
                tmp[i] = 1
        vect.append(tmp)

    return vocab, np.array(vect)


'''
Input: dataset to be operated on
Output: dataset after operation 
'''

if __name__ == "__main__":
    st = "The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. His name was Jazzy and he had 7 bones. Hey there! Okay, bye."

    zlogger.log("dataSet.main", "Starting")

    src = "dataSet.main.example"

    df = initiateDataset(st)
    zlogger.log(src, "Dataset of {} lines".format(len(df)))
    print(df, "\n")

    tokenz = wordTokenizeWithoutPunctuations(df)
    vocab = getVocabList(df)
    zlogger.log(
        src, "There are {} words and {} vocab".format(len(tokenz), len(vocab)))
    print("{}\n{}\n".format(tokenz, vocab))

    vocab, matrix = oneHotEncode_LemmaBagOfWords(st)
    zlogger.log(src, "Vocab = {} Matrix = {}".format(len(vocab), matrix.shape))
Example #23
        }
        return { **tmp , **tmp2}

    def getPredictedAtIndex(self, y_index): 
        if y_index is None:
            return None, None
        # zlogger.log( 'zdataset.get_y-at', "IN: {}".format(y_index ) )
        class_cat = self.y_labelz[ y_index ] 
        # zlogger.log( 'zdataset.get_y-at', "CAT: {}".format( class_cat ) ) 
        return class_cat, self.faq_db.get( class_cat , None )  

###########################################################

if __name__ == "__main__":
    src = "dataset.main"
    zlogger.log(src, ">>>>> STARTING\n")
    
    st = "The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. His name was Jazzy and he had 7 bones. Hey there! Okay, bye." 
    # st = nltk.sent_tokenize( st )

    ds = ["The quick brown fox", "He had 7 bones"] 

    ps = "The brown bones" # predict text 
    
    tokz = lemmatizeTokens(st)
    print( "Tokens len: {}\n{}\n".format( len(tokz), tokz) )
    

    dset = ZDataset()
    dset.initFromSeq( ds ) 
    dset.preprocess()     
Example #24
Return: 
'''


def writeTo(content, dpath, dtype=zFILE, mode=MODE_WRITE):
    res = STREAMZ.get(dtype, doFile)
    zlogger.log("dataSource.writeTo", "dpath = {}".format(dpath))
    res(
        dpath,
        mode=mode,
        content=content,
    )


if __name__ == "__main__":
    zlogger.log("dataSource.main", "Starting")

    arange = 'FAQ responses!A1:G1000'
    gsheet_id = '1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks'  #covid_19_faq
    # gsheet_id = 'covid_19_faq'

    etype = ['Text File', 'PDF', 'Article', 'Site', 'Serialized', 'GSheet']
    etype_i = [zFILE, zPDF, zARTICLE, zNESTED_ARTICLES, zSERIALIZED, zGSHEET]
    epath = [
        'example.txt', 'example.pdf',
        'https://www.nation.co.ke/counties/nairobi/Police-kill-ATM-heist-mastermind/1954174-5503356-aodphx/index.html',
        'https://www.standardmedia.co.ke/corporate/news', 'example.byt',
        (gsheet_id, arange)
    ]
    econtent = [
        'The quick brown fox jumped over the lazy dogs.' * 7, None, None, None,
Example #25
            elif word in self.EXIT_INPUT:
                response = random.choice(
                    self.THANKS_RESPONSE) + ". " + random.choice(
                        self.EXIT_RESPONSE)
                rcode = self.RCODE_EXIT_RESPONSE
                return response, rcode

        if was_que:
            response = self.model.predict(user_input_text)
            rcode = self.RCODE_LEARNT_RESPONSE

        return response, rcode


if __name__ == "__main__":
    zlogger.log("botLogic.main", "Starting")

    bot = BotLogicFlow()
    bot.initializeModel(BotLogicFlow.MODEL_TFIDF, "TFIDF_ChatBot.zmd")

    while True:
        user_input = input(colored("Talk to me: ", "yellow"))
        prompt = colored(">>>: ", "green")

        response, rcode = bot.getResponse(user_input)

        print("{} {}\n".format(
            prompt, "I don't understand. Try that again"
            if response is None else response))

        if rcode == BotLogicFlow.RCODE_EXIT_RESPONSE:  # was the magic number -99
Example #26
import <app_pkg_name>
from <app_pkg_name>.bin import zlogger 

app = <app_pkg_name>.init_app() 

zlogger.startLogger("<app_pkg_name>")

if __name__ == "__main__":
    '''
    TODO: populate dummy data, setup zlogger 
    ''' 
    zlogger.log( "run.py" f"starting {__name__}" ) 

    app.run(debug=True)