Example #1
0
 def __init__(self, configDictionary):
     """Initialize the MLBrain module from the configuration dictionary.

     Reads the storage/dictionary paths and training options from the
     'brain' and 'bender-training' sections, validates the mandatory
     paths (missing ones are fatal), builds the gensim dictionary
     manager, and loads any previously persisted data.
     """
     super(MLBrain, self).__init__(configDictionary)
     self.profile = {"name": "mlbrain-module", "class": "mlbrain"}
     # Filled in later by the framework (see initForBender elsewhere).
     self.nlp_module = None
     self.concept_module = None
     self.data_storage_path = utils.getKeyFromSectionInConfiguration(
         'brain', 'data_storage_path', None, configDictionary)
     self.dictionary_data_source_path = utils.getKeyFromSectionInConfiguration(
         'bender-training', 'dictionary_data_source_path', None,
         configDictionary)
     self.dictionary_output_path = utils.getKeyFromSectionInConfiguration(
         'bender-training', 'dictionary_output_path', None,
         configDictionary)
     # Identity comparison with None (`is None`) instead of `== None`.
     if self.dictionary_output_path is None:
         print(
             '**** ERROR: No Dictionary output path defined in bender-training section of config-file.'
         )
         sys.exit(1)
     self.dict_filename = os.path.join(self.dictionary_output_path,
                                       'dictionary.dict')
     self.remove_stop_words = int(
         utils.getKeyFromSectionInConfiguration('bender-training',
                                                'remove_stop_words', '0',
                                                configDictionary))
     self.language = utils.getKeyFromSectionInConfiguration(
         'bender-training', 'data_language_short', 'en', configDictionary)
     self.stop_words = get_stop_words(self.language)
     utils.safe_create_directory(self.dictionary_output_path)
     if self.data_storage_path is None:
         print('**** ERROR: No data storage path specified. Exiting!')
         sys.exit(1)
     self.dictionary_manager = GensimDictionary(
         self.dictionary_data_source_path, self.dict_filename,
         self.remove_stop_words, self.stop_words)
     self._loadData()
Example #2
0
    def __init__(self, configDictionary):
        """Initialize the hunspell-based spelling module.

        Loads the hunspell dictionary/affix files and the NLTK punkt
        sentence tokenizer named in the 'spelling' configuration section.
        Missing mandatory settings or an unloadable tokenizer terminate
        the process; an optional word list triggers training.
        """
        super(HunSpelling, self).__init__(configDictionary)
        self.profile = {
            "name": "hunspelling-module",
            "class": "spelling",
            "supported-languages": ["de", "en", "tr"]
        }

        self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
        self.dict_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-dict-file', None, configDictionary)
        if self.dict_file is None:
            print('*** Missing spelling-dict-file in configuration. Exiting.')
            sys.exit(1)

        self.aff_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-aff-file', None, configDictionary)
        # BUG FIX: this check previously re-tested self.dict_file, so a
        # missing aff file was never detected and hunspell failed later.
        if self.aff_file is None:
            print('*** Missing spelling-aff-file in configuration. Exiting.')
            sys.exit(1)

        self.add_words_file = utils.getKeyFromSectionInConfiguration('spelling', 'training-add-words-from-file', None, configDictionary)

        self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)
        if self.speller is None:
            print('>>>>>> Could not create speller...')
        tokenizer_language = utils.getKeyFromSectionInConfiguration('spelling', 'tokenizer-language', 'german', configDictionary)
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
        except Exception:
            # Narrowed from a bare `except:` so SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            print('>>>>>> Could not load TOKENIZER language file.')
            sys.exit(1)

        if self.add_words_file is not None:
            self.train()
Example #3
0
 def __init__(self, configDictionary):
     """Set up the spaCy similarity module from the configuration.

     Pulls the result limit, thresholds and language-model name from the
     'similarity' section and prepares the (initially empty) document
     store plus the lock guarding updates to it.
     """
     super(SpacySIM, self).__init__(configDictionary)
     self.profile = {
         "name": "spacy-module",
         "class": "nlp-deepnn-analysis",
         "supported_request_types": ["text"],
         "supported-languages": ['de', 'en']
     }
     self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
     # Local alias keeps the repeated configuration lookups readable.
     getkey = utils.getKeyFromSectionInConfiguration
     self.maxSim = int(
         getkey('similarity', 'max_similarity_to_return', 1,
                configDictionary))
     self.language_model = getkey('similarity', 'language_model',
                                  'de_core_news_sm', configDictionary)
     self.simLowerThreshold = float(
         getkey('similarity', 'similarity_lower_threshold', 0.5,
                configDictionary))
     self.simHigherThreshold = float(
         getkey('similarity', 'similarity_higher_threshold', 0.7,
                configDictionary))
     self.configDictionary = configDictionary
     self.update_lock = threading.Lock()
     self.alldocs = None
Example #4
0
    def __init__(self, configDictionary):
        """Initialize the enchant-based spelling module.

        Creates the enchant dictionary for the configured language and
        loads the NLTK punkt sentence tokenizer; failure of either is
        fatal. An optional word-list file triggers training.
        """
        super(EnchantSpelling, self).__init__(configDictionary)
        self.profile = {
            "name": "enchantspelling-module",
            "class": "spelling",
            "supported-languages": ["de", "en", "tr"]
        }

        self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
        self.dict_language = utils.getKeyFromSectionInConfiguration(
            'spelling', 'spelling-language-full', None, configDictionary)
        try:
            self.speller = enchant.Dict(self.dict_language)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            print('>>>>>>> Could not load language spelling dictionary ',
                  self.dict_language)
            sys.exit(1)

        self.add_words_file = utils.getKeyFromSectionInConfiguration(
            'spelling', 'training-add-words-from-file', None, configDictionary)

        tokenizer_language = utils.getKeyFromSectionInConfiguration(
            'spelling', 'tokenizer-language', 'german', configDictionary)
        try:
            self.tokenizer = nltk.data.load(
                'tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
        except Exception:
            print('>>>>>> Could not load TOKENIZER language file.')
            sys.exit(1)

        if self.add_words_file is not None:
            self.train()
Example #5
0
 def __init__(self, moduleConfigSection, configDictionary):
     """Initialize the multithreaded WMD internal machine logic.

     Reads thresholds, worker counts and the retraining cadence from the
     module config section (falling back to the global configuration for
     language/stop-word settings) and prepares empty runtime state; the
     WMD model itself is loaded later during training.
     """
     super(WMDLogicMT, self).__init__(moduleConfigSection, configDictionary)
     # NOTE: a dead `global WMD_FILE` declaration was removed here --
     # nothing in this method assigns to it, so it had no effect.
     self.profile = {
         "name": "wmd-logic",
         "class": "internalmachine-logic",
         'accepted-languages': [
             'de', 'en', 'fr', 'tr', 'it', 'nl', 'se', 'no', 'fi', 'pl',
             'cz', 'hu'
         ],
         'accepted-media-types': ['text/utf8'],
         'returned-media-types': ['text/utf8'],
         'requires-original-query': True,
         'returns-response-id': True,
         'always-ask': True
     }
     self.module_config = moduleConfigSection
     self.config_dict = configDictionary
     language = utils.getKeyFromSectionInConfiguration(
         'bender-training', 'data_language_short', 'en', self.config_dict)
     self.remove_stop_words = int(
         utils.getKeyFromSectionInConfiguration('bender-training',
                                                'remove_stop_words', 1,
                                                self.config_dict))
     # Retraining never runs more often than every 5 minutes.
     retraining_interval_mins = max(
         5,
         int(self.module_config.get('retraining_interval_in_minutes', 23)))
     self.retraining_interval_in_seconds = retraining_interval_mins * 60
     self.stop_words = get_stop_words(language)
     self.higher_threshold = float(
         self.module_config.get('wmd_higher_threshold', 0.7))
     self.lower_threshold = float(
         self.module_config.get('wmd_lower_threshold', 0.5))
     self.num_results = int(self.module_config.get('max_wmd_results', '10'))
     self.num_instances = int(
         self.module_config.get('wmd_num_instances', '10'))
     self.wmd_timeout = int(self.module_config.get('wmd_timeout', '30'))
     self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
     self.is_master = int(self.module_config.get('is-master', 0))
     self.contribution_factor = int(
         self.module_config.get('contribution-factor', 500))
     # Runtime state, populated during training / query handling.
     self.wmd_model = None
     self.wmd_instances = []
     self.query_results = {}
     self.process_queues = {}
     self.chunk_size = 0
     self.learning_update_timer = None
     self.learning_lock = threading.Lock()
     self.requires_learning = False
     self.wmd_instances_lock = threading.Lock()
Example #6
0
    def __init__(self, configDictionary):
        """Initialize the trainer from the 'bender-training' configuration.

        Resolves data source/output paths and media types, instantiates
        the configured concept and NLP modules, and creates the output
        directory. A missing 'train_data_source_file' is fatal.
        """
        self.configDictionary = configDictionary
        self.train_data_source_file = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'train_data_source_file', None,
            configDictionary)
        if not self.train_data_source_file:
            print(
                "Config does not contain 'train_data_source_file', please provide one."
            )
            # sys.exit instead of the site-provided exit() builtin, which
            # is absent when Python runs without the `site` module.
            sys.exit(1)

        self.query_media_type = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'query_media_type', None, configDictionary)
        self.response_media_type = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'response_media_type', None, configDictionary)
        self.raw_data_format = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'raw_data_format', None, configDictionary)
        self.train_data_q_media_type = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'train_data_q_media_type', None,
            configDictionary)
        self.train_data_a_media_type = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'train_data_a_media_type', None,
            configDictionary)
        self.output_path = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'output_path', None, configDictionary)
        self.train_data_queries_root_dir = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'converted_train_data_q_path', None,
            configDictionary)
        self.train_data_answers_dir = utils.getKeyFromSectionInConfiguration(
            'bender-training', 'converted_train_data_a_path', None,
            configDictionary)
        self.generate_lsi = int(
            utils.getKeyFromSectionInConfiguration('bender-training',
                                                   'generate_lsi', 0,
                                                   configDictionary))
        concept = utils.getModulenameFromConfiguration(
            'concept', 'modules.concept.mlconcept.MLConcept', configDictionary)
        utils.validate_module_class(concept, MLConcept)
        self.concept = utils.initialize_class(concept, configDictionary)

        nlp = utils.getModulenameFromConfiguration('nlp',
                                                   'modules.nlp.mlnlp.MLNLP',
                                                   configDictionary)
        utils.validate_module_class(nlp, MLNLP)
        self.nlp = utils.initialize_class(nlp, configDictionary)
        utils.safe_create_directory(self.output_path)
        # (A redundant second `self.configDictionary = configDictionary`
        # assignment was removed -- it is already set at the top.)
        self.question_file = ''
        self.answers_file = ''
        self.questions = []
        self.answers = []
        self.internalMachineLogics = []
Example #7
0
 def __init__(self, configDictionary):
     """Connect to MongoDB and select the configured database/collection."""
     super(MongoDBStorage, self).__init__(configDictionary)
     self.profile = {
         "name": "mongodb-storage",
         "class": "permanent-storage",
         "verson": "1.0"
     }
     getkey = utils.getKeyFromSectionInConfiguration
     self.database_name = getkey('permanentstorage',
                                 'mongodb_storage_database',
                                 'bender_database', configDictionary)
     self.database_uri = getkey('permanentstorage',
                                'mongodb_storage_database_uri',
                                'mongodb://localhost:27017/',
                                configDictionary)
     self.client = MongoClient(self.database_uri)
     self.database = self.client[self.database_name]
     self.entries = self.database['benderentries']
     # We need to create indexes, but this is left for later...
     # self.statements.create_index('ownerID', unique=True)
Example #8
0
 def __init__(self, brain, configDict):
     """Prepare LSI training state: paths, topic count and stop words.

     All model/index artefacts are laid out below the configured
     output_path; the required sub-directories are created up front.
     """
     getkey = utils.getKeyFromSectionInConfiguration
     self.config_dict = configDict
     self.brain = brain
     self.dict_source_dir = getkey('bender-training',
                                   'dictionary_data_source_path', None,
                                   configDict)
     self.train_data_source_dir = getkey('bender-training',
                                         'converted_train_data_q_path',
                                         None, configDict)
     self.num_topics = int(
         getkey('bender-training', 'num_topics_lsi', 200, configDict))
     self.output_root_path = getkey('bender-training', 'output_path', None,
                                    configDict)
     # Artefact locations relative to the output root.
     root = self.output_root_path
     self.dict_filename = root + '/dictionary/dictionary.dict'
     self.corpus_filename = root + '/corpus/corpus.mm'
     self.tfidf_corpus_filename = root + '/corpus/tfidf.mm'
     self.lsi_filename = root + '/models/lsi.model'
     self.tfidf_model_filename = root + '/models/tfidf.model'
     self.lsi_index_filename = root + '/index/lsi_index'
     self.tfidf_idx_filename = root + '/index/tfidf.index'
     self.doc2id_filename = root + '/ids/word2id.pickle'
     self.id2doc_filename = root + '/ids/id2word.pickle'
     self.language = getkey('bender-training', 'data_language_short', 'en',
                            configDict)
     self.remove_stop_words = int(
         getkey('bender-training', 'remove_stop_words', 1, configDict))
     self.stop_words = get_stop_words(self.language)
     self._create_directories()
Example #9
0
 def __init__(self, brain, configDict):
     """Build runtime LSI/word2vec state and kick off the initial load.

     Mirrors the training-time path layout under output_path and sets up
     the retraining-timer bookkeeping before loading persisted models.
     """
     getkey = utils.getKeyFromSectionInConfiguration
     self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
     self.config_dict = configDict
     self.brain = brain
     self.lsi_index_mutex = threading.Lock()
     self.num_topics = int(
         getkey('bender-training', 'num_topics_lsi', 200, configDict))
     self.output_root_path = getkey('bender-training', 'output_path', None,
                                    configDict)
     # Artefact locations relative to the output root.
     root = self.output_root_path
     self.corpus_filename = root + '/corpus/corpus.mm'
     self.tfidf_corpus_filename = root + '/corpus/tfidf.mm'
     self.lsi_filename = root + '/models/lsi.model'
     self.tfidf_model_filename = root + '/models/tfidf.model'
     self.lsi_index_filename = root + '/index/lsi_index'
     self.tfidf_idx_filename = root + '/index/tfidf.index'
     self.doc2id_filename = root + '/ids/word2id.pickle'
     self.id2doc_filename = root + '/ids/id2word.pickle'
     self.word2vec_filename = os.path.join(root, 'vectors',
                                           'word2vec.embeddings')
     self.language = getkey('bender-training', 'data_language_short', 'en',
                            configDict)
     self.remove_stop_words = int(
         getkey('bender-training', 'remove_stop_words', 1, configDict))
     self.stop_words = get_stop_words(self.language)
     # Retraining runs no more often than every 5 minutes.
     minutes = int(
         getkey('bender-training', 'retraining_interval_in_minutes', 30,
                configDict))
     self.retraining_interval_in_seconds = max(minutes, 5) * 60
     self.learning_update_timer = None
     self.learning_lock = threading.Lock()
     self.requires_learning = False
     self._load()
Example #10
0
 def __init__(self, configDictionary):
     """Configure the LSI similarity module: result limit and thresholds."""
     super(LSISimilarity, self).__init__(configDictionary)
     self.profile = {
         "name": "lsi-module",
         "class": "latent-semantic-analysis",
         "supported_request_types": ["text"],
         "supported-languages": ["de"]
     }
     self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
     # Local alias keeps the repeated configuration lookups readable.
     getkey = utils.getKeyFromSectionInConfiguration
     self.maxSim = int(
         getkey('similarity', 'max_similarity_to_return', 1,
                configDictionary))
     self.simLowerThreshold = float(
         getkey('similarity', 'similarity_lower_threshold', 0.5,
                configDictionary))
     self.simHigherThreshold = float(
         getkey('similarity', 'similarity_higher_threshold', 0.7,
                configDictionary))
     self.configDictionary = configDictionary
Example #11
0
 def __init__(self, configDictionary):
     """Initialize the session module: tokenizer, timeout and session map."""
     super(MLSession, self).__init__(configDictionary)
     self.profile = {
         "name": "mlsession-module",
         "class": "session",
         "session-timeout": 10400
     }
     # Raw string: '\w' is not a valid Python string escape (it only
     # worked by accident and raises a DeprecationWarning); r'\w+' makes
     # the regex intent explicit.
     self.wordTokenizer = RegexpTokenizer(r'\w+')
     self.session_timeout = int(
         utils.getKeyFromSectionInConfiguration(
             'session', 'session_timeout', self.profile['session-timeout'],
             configDictionary))
     self.sessions = {}
Example #12
0
 def __init__(self, configDictionary):
     """Set up the request/performance log files and the chaining hash key.

     The hash key is derived from a built-in secret plus the configured
     `log_hash`; a missing `log_hash` is fatal. The last entry of each
     existing log file is remembered so new entries can be chained to it.
     """
     self.config = configDictionary
     self.key = 'nmxcvjkhsdf98u53429kjhasd901423jkhdsfzcxvmnuitgre4325809cneu3io'
     log_path = utils.getKeyFromSectionInConfiguration(
         'bender-core', 'log_directory', 'logs', configDictionary)
     hash_key = utils.getKeyFromSectionInConfiguration(
         'bender-core', 'log_hash', None, configDictionary)
     self.lock = threading.Lock()
     if hash_key is None:
         logger = logging.getLogger(os.path.basename(sys.argv[0]))
         logger.error(
             'Missing `log_hash` in `bender-core` section of configuration file. Will exit!'
         )
         sys.exit(1)
     new_hash_key = self.key + hash_key
     # hashlib requires bytes on Python 3 -- hashing the str directly
     # raised TypeError.
     self.hash_key = hashlib.sha256(new_hash_key.encode('utf-8')).hexdigest()

     def _last_entry(path):
         # Last line of *path* (stripped), or None when absent/empty.
         # The `with` block closes the read handle, which the original
         # code leaked.
         if not os.path.exists(path):
             return None
         with codecs.open(path, 'r', 'utf-8') as fh:
             lines = fh.readlines()
         return lines[-1].strip() if lines else None

     requests_log_file = os.path.join(log_path, 'requests.log')
     self.last_req_log_entry = _last_entry(requests_log_file)
     self.requests_log_file = codecs.open(requests_log_file, 'a', 'utf-8')
     p_log_file = os.path.join(log_path, 'performance.log')
     self.last_log_entry = _last_entry(p_log_file)
     self.performance_log_file = codecs.open(p_log_file, 'a', 'utf-8')
Example #13
0
    def _loadAnnoyIndex(self):
        """Load the Annoy index and LSI/TF-IDF models from annoy_data_path.

        Reads index tuning parameters from the module configuration,
        ensures the output directory exists, then loads:
          * the LSI-vector corpus and Annoy index (fatal if missing),
          * the LSI model (fatal if missing),
          * the TF-IDF model (optional).
        """
        # `with` releases the lock on every exit path -- the original
        # explicit acquire()/release() pair leaked the lock whenever a
        # missing index/model triggered sys.exit() or loading raised.
        with self.loading_lock:
            self.dictionary = self.brain.getDictionary()
            self.num_topics_lsi = int(
                utils.getKeyFromSectionInConfiguration('bender-training',
                                                       'num_topics_lsi', 200,
                                                       self.config_dict))
            self.output_path = self.module_config.get('annoy_data_path', '')
            self.accuracy = int(self.module_config.get('accuracy', 500))
            self.max_results = int(self.module_config.get('max_results', 100))
            retraining_iv = int(
                self.module_config.get('retraining_interval_in_minutes', 7))
            self.retraining_interval_in_seconds = retraining_iv * 60
            utils.safe_create_directory(self.output_path)
            self.lsi_vectors_filename = os.path.join(self.output_path,
                                                     LSI_VECTORS_FNAME)
            self.lsi_model_filename = os.path.join(self.output_path,
                                                   LSI_MODEL_FNAME)
            self.tfidf_model_filename = os.path.join(self.output_path,
                                                     TFIDF_MODEL_FNAME)
            self.annoy_index_filename = os.path.join(self.output_path,
                                                     ANNOY_OUTPUT_FNAME)
            self.clipped_corpus_filename = os.path.join(self.output_path,
                                                        CLIPPED_CORPUS_FNAME)
            if os.path.exists(self.annoy_index_filename):
                self.mm = gensim.corpora.MmCorpus(self.lsi_vectors_filename)
                num_features, num_docs = self.mm.num_terms, min(
                    self.mm.num_docs, MAX_DOCS)
                self.index_annoy = annoy.AnnoyIndex(num_features,
                                                    metric='angular')
                self.index_annoy.load(self.annoy_index_filename)
            else:
                print(
                    '**** ERROR: Annoy index does not exist. Please train first!')
                sys.exit(1)

            if os.path.exists(self.lsi_model_filename):
                self.lsi_model = gensim.models.LsiModel.load(
                    self.lsi_model_filename)
            else:
                print('**** ERROR: Annoy LSI Model missing. Please train first!')
                sys.exit(1)

            if os.path.exists(self.tfidf_model_filename):
                self.tfidf_model = gensim.models.TfidfModel.load(
                    self.tfidf_model_filename)
Example #14
0
 def __init__(self, configDictionary):
     """Initialize JSON-file backed permanent storage.

     Loads existing entries from the configured JSON database file, or
     creates the file's parent directory when it does not exist yet.
     """
     super(JSONStorage, self).__init__(configDictionary)
     self.entries = {}
     jsonDatabaseFile = utils.getKeyFromSectionInConfiguration(
         'permanentstorage', 'json_storage_database',
         '/tmp/-json-storage.json', configDictionary)
     self.mutex_lock = threading.Lock()
     self.jsonDataFile = jsonDatabaseFile
     self.profile = {
         "name": "json-storage",
         "class": "permanent-storage",
         "verson": "1.0"
     }
     if os.path.isfile(self.jsonDataFile):
         # Close the handle deterministically instead of leaking it, as
         # json.load(codecs.open(...)) did.
         with codecs.open(self.jsonDataFile, 'r', 'utf-8') as fh:
             self.entries = json.load(fh)
     else:
         utils.safe_create_directory(os.path.dirname(self.jsonDataFile))
Example #15
0
    def __init__(self, configDictionary):
        """Bootstrap the Bender core: logging, config, and all modules.

        Sets up file/memory logging, reads core settings from the
        'bender-core' section, resolves every pluggable module class from
        the configuration, instantiates and wires them via initForBender,
        and finally builds the lists of data providers/extractors,
        internal machine logics and preprocessors.
        """
        self.configuration = configDictionary
        self.security = BenderSecurity()
        # --- Logging: file handler buffered through a MemoryHandler ---
        logFile = utils.getKeyFromSectionInConfiguration('bender-core', 'bender_core_logfile', 'logs/bender-core.log', configDictionary)
        utils.safe_create_directory(os.path.dirname(logFile))
        logging.basicConfig(level=logging.WARNING, filename=logFile + '.libs.log', format=LOG_FORMAT)
        self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
        loggingFileHandler = logging.FileHandler(logFile)
        loggingFileHandler.setLevel(logging.INFO)
        # Buffer up to 128 records before flushing to the file handler.
        loggingMemoryHandler = logging.handlers.MemoryHandler(128, target=loggingFileHandler)
        loggingMemoryHandler.setLevel(logging.INFO)
        loggingFormatter = logging.Formatter(LOG_FORMAT)
        self.logger.addHandler(loggingMemoryHandler)
        loggingFileHandler.setFormatter(loggingFormatter)
        loggingMemoryHandler.setFormatter(loggingFormatter)
        self.logger.setLevel(logging.INFO)
        self.logger.info('###################### STARTING A NEW BENDER INSTANCE #######################')
        self.logger.info('running %s' % ' '.join(sys.argv))
        # --- Core behaviour flags and thresholds from 'bender-core' ---
        self.interactive = int(utils.getKeyFromSectionInConfiguration('bender-core', 'interactive', 0, configDictionary))
        self.num_results = int(utils.getKeyFromSectionInConfiguration('bender-core', 'num_results', 1, configDictionary))
        self.use_hli = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_hli', 1, configDictionary))
        self.use_lookup = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_lookup', 1, configDictionary))
        self.name = utils.getKeyFromSectionInConfiguration('bender-core', 'name', 'Bender', configDictionary)
        self.personality = utils.getKeyFromSectionInConfiguration('bender-core', 'personality', 'mail-responder', configDictionary)
        self.lowerCL = float(utils.getKeyFromSectionInConfiguration('bender-core', 'lowerconfidence_level', 0.85, configDictionary))
        self.higherCL = float(utils.getKeyFromSectionInConfiguration('bender-core', 'higherconfidence_level', 0.95, configDictionary))
        # Default reinforcement timeout: one week, in seconds.
        self.reinforcementTimeout = int(utils.getKeyFromSectionInConfiguration('bender-core', 'reinforcement_timeout', 7 * 24 * 60 * 60, configDictionary))
        self.useSimilarity = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_similarity', 1, configDictionary))

        # --- Resolve the module class for each pluggable component ---
        transient_storage = utils.getModulenameFromConfiguration('transientstorage', 'modules.storage.transientstorage.MLTransientStorage', configDictionary)
        permanent_storage = utils.getModulenameFromConfiguration('permanentstorage', 'modules.storage.permanentstorage.MLPermanentStorage', configDictionary)
        lookup = utils.getModulenameFromConfiguration('lookup', 'modules.lookup.mllookup.MLLookup', configDictionary)
        session = utils.getModulenameFromConfiguration('session', 'modules.session.mlsession.MLSession', configDictionary)
        nlp = utils.getModulenameFromConfiguration('nlp', 'modules.nlp.mlnlp.MLNLP', configDictionary)
        humanLogic = utils.getModulenameFromConfiguration('humanlogic', 'modules.humanlogic.mlhumanlogic.MLHumanLogic', configDictionary)
        concept = utils.getModulenameFromConfiguration('concept', 'modules.concept.mlconcept.MLConcept', configDictionary)
        stt = utils.getModulenameFromConfiguration('stt', 'modules.speech.mlstt.MLSTT', configDictionary)
        tts = utils.getModulenameFromConfiguration('tts', 'modules.speech.mltts.MLTTS', configDictionary)
        responseProcessor = utils.getModulenameFromConfiguration('response-postprocessor', 'modules.response.mlresponseproc.MLResponseProcessor', configDictionary)
        dataInfusor = utils.getModulenameFromConfiguration('datainfusor', 'modules.datainfusor.mldatainfusor.MLDataInfusor', configDictionary)
        similarity = utils.getModulenameFromConfiguration('similarity', 'module.similarity.mlsimilarity.MLSimilarity', configDictionary)
        indexedResponseProc = utils.getModulenameFromConfiguration('indexed-response-processor', 'module.response.mlidxresponseproc.MLIndexedResponseProcessor', configDictionary)
        brain = utils.getModulenameFromConfiguration('brain', 'modules.brain.mlbrain.MLBrain', configDictionary)
        spelling = utils.getModulenameFromConfiguration('spelling', 'modules.spelling.mlspelling.MLSpelling', configDictionary)

        # --- Validate each class against its base type and instantiate ---
        utils.validate_module_class(transient_storage, MLTransientStorage)
        self.transientStorage = utils.initialize_class(transient_storage, configDictionary)

        utils.validate_module_class(permanent_storage, MLPermanentStorage)
        self.permanentStorage = utils.initialize_class(permanent_storage, configDictionary)

        utils.validate_module_class(lookup, MLLookup)
        self.lookup = utils.initialize_class(lookup, configDictionary)

        utils.validate_module_class(session, MLSession)
        self.session = utils.initialize_class(session, configDictionary)

        utils.validate_module_class(nlp, MLNLP)
        self.nlp = utils.initialize_class(nlp, configDictionary)

        utils.validate_module_class(humanLogic, MLHumanLogic)
        self.humanLogic = utils.initialize_class(humanLogic, configDictionary)
        self.humanLogicClass = humanLogic

        utils.validate_module_class(concept, MLConcept)
        self.concept = utils.initialize_class(concept, configDictionary)

        utils.validate_module_class(stt, MLSpeechToText)
        self.stt = utils.initialize_class(stt, configDictionary)

        utils.validate_module_class(tts, MLTextToSpeech)
        self.tts = utils.initialize_class(tts, configDictionary)

        utils.validate_module_class(dataInfusor, MLDataInfusor)
        self.dataInfusor = utils.initialize_class(dataInfusor, configDictionary)

        utils.validate_module_class(responseProcessor, MLResponseProcessor)
        self.responseProcessor = utils.initialize_class(responseProcessor, configDictionary)

        utils.validate_module_class(similarity, MLSimilarity)
        self.similarity = utils.initialize_class(similarity, configDictionary)

        utils.validate_module_class(indexedResponseProc, MLIndexedResponseProcessor)
        self.indexedResponseProcessor = utils.initialize_class(indexedResponseProc, configDictionary)

        utils.validate_module_class(brain, MLBrain)
        self.brain = utils.initialize_class(brain, configDictionary)

        utils.validate_module_class(spelling, MLSpelling)
        self.spelling = utils.initialize_class(spelling, configDictionary)

        self.machineLogic = MLMachineLogic(configDictionary)

        # NOTE: Even though we try to instantiate the classes in the right order
        # and try to call their 'initForBender' in the right order, you should NEVER, in your module
        # depend on any of the modules being already initialized in your implementation of these two
        # methods:
        #       __init__
        #       initForBender
        # Instead, you should request any other module instance only when you actually need them
        # during the processing of data
        #
        # The only exception is MLBrain that relies on concept and nlp being already initialized before
        # itself is called ... and this only during training...
        self.permanentStorage.initForBender(self)
        self.transientStorage.initForBender(self)
        self.nlp.initForBender(self)
        self.concept.initForBender(self)
        self.brain.initForBender(self)
        self.lookup.initForBender(self)
        self.session.initForBender(self)
        self.humanLogic.initForBender(self)
        self.stt.initForBender(self)
        self.tts.initForBender(self)
        self.dataInfusor.initForBender(self)
        self.responseProcessor.initForBender(self)
        self.similarity.initForBender(self)
        self.indexedResponseProcessor.initForBender(self)
        self.machineLogic.initForBender(self)
        self.spelling.initForBender(self)
        # --- List-valued sections: providers, extractors, logics, preprocs ---
        self.dataProviders = []
        dataproviders = utils.getSectionFromConfiguration('dataproviders', [ 'modules.dataproviders.mldataprovider.MLDataProvier' ], configDictionary)
        for dp in dataproviders:
            module = dp['module']
            utils.validate_module_class(module, MLDataProvider)
            newInstance = utils.initialize_class_with_config_section(module, dp, configDictionary)
            newInstance.initForBender(self)
            self.dataProviders.append(newInstance)
        self.dataExtractors = []
        dataextractors = utils.getSectionFromConfiguration('dataextractors', [ 'modules.dataextractors.mldataextrator.MLDataExtractor' ], configDictionary)
        for de in dataextractors:
            module = de['module']
            utils.validate_module_class(module, MLDataExtractor)
            newInstance = utils.initialize_class_with_config_section(module, de, configDictionary)
            newInstance.initForBender(self)
            self.dataExtractors.append(newInstance)
        self.internalMachineLogics = []
        internalMLs = utils.getSectionFromConfiguration('i-machinelogic', None, configDictionary)
        for iml in internalMLs:
            module = iml['module']
            utils.validate_module_class(module, MLInternalMachineLogic)
            newInstance = utils.initialize_class_with_config_section(module, iml, configDictionary)
            newInstance.initForBender(self)
            self.internalMachineLogics.append(newInstance)
        self.preprocessors = []
        preprocs = utils.getSectionFromConfiguration('preprocessors', None, configDictionary)
        for preproc in preprocs:
            module = preproc['module']
            utils.validate_module_class(module, MLRequestProcessor)
            newInstance = utils.initialize_class_with_config_section(module, preproc, configDictionary)
            newInstance.initForBender(self)
            self.preprocessors.append(newInstance)
        self.jobData = {}
        self.sessionData = {}
        self.configuration = configDictionary
        self.dataInfusor.setDataExtractorsAndProviders(self.dataExtractors, self.dataProviders)
        self.benderjob_logger = BenderJobLogger(configDictionary)
Example #16
0
def create_annoy_index(moduleConfigSection, configDict, dictionary, corpus,
                       in_q, out_q):
    """Build and persist an Annoy nearest-neighbour index from a gensim corpus.

    Pipeline: corpus -> TF-IDF -> LSI (num_topics from 'bender-training'
    config) -> unit-normalized LSI vectors serialized as MatrixMarket ->
    clipped dense numpy matrix (at most MAX_DOCS rows) -> Annoy index with
    angular metric. All intermediate models/files are saved under the
    section's 'annoy_data_path'.

    Designed to run as a child process: it signals completion by putting
    'DONE' on out_q, waits for an acknowledgement on in_q, then terminates
    the process with os._exit(0).

    :param moduleConfigSection: dict-like config section; keys used:
        'annoy_data_path' (output dir), 'accuracy' (Annoy tree count,
        default 500), 'max_results' (read but unused here).
    :param configDict: full configuration; only 'bender-training' /
        'num_topics_lsi' (default 200) is read.
    :param dictionary: gensim dictionary (id2word mapping for the LSI model).
    :param corpus: gensim bag-of-words corpus to index.
    :param in_q: multiprocessing queue; a single get() blocks until the
        parent acknowledges.
    :param out_q: multiprocessing queue; receives the 'DONE' sentinel.
    """
    logger = logging.getLogger(os.path.basename(sys.argv[0]))
    module_config = moduleConfigSection
    # NOTE(review): config_dict is assigned but never used below; configDict
    # is referenced directly instead. Kept as-is (doc-only change).
    config_dict = configDict
    num_topics_lsi = int(
        utils.getKeyFromSectionInConfiguration('bender-training',
                                               'num_topics_lsi', 200,
                                               configDict))
    output_path = module_config.get('annoy_data_path', '')
    # 'accuracy' is passed to index_annoy.build() below, i.e. the number of
    # Annoy trees (more trees = better recall, larger index).
    accuracy = int(module_config.get('accuracy', 500))
    # NOTE(review): max_results is read from config but not used in this
    # function — presumably consumed by the query side. Confirm.
    max_results = int(module_config.get('max_results', 100))
    utils.safe_create_directory(output_path)
    # Output artifacts; the *_FNAME constants are module-level (not visible
    # in this chunk). Note tfidf_vectors_file uses TFIDF_MODEL_FNAME despite
    # the variable name.
    lsi_vectors_file = os.path.join(output_path, LSI_VECTORS_FNAME)
    lsi_model_filename = os.path.join(output_path, LSI_MODEL_FNAME)
    tfidf_vectors_file = os.path.join(output_path, TFIDF_MODEL_FNAME)
    annoy_output_filename = os.path.join(output_path, ANNOY_OUTPUT_FNAME)
    clipped_output_filename = os.path.join(output_path, CLIPPED_CORPUS_FNAME)

    tfidf = gensim.models.TfidfModel(corpus)
    logger.info('Saving Tfidf...')
    tfidf.save(tfidf_vectors_file)

    logger.info('*** START generating LSI...')
    lsi = gensim.models.LsiModel(tfidf[corpus],
                                 id2word=dictionary,
                                 num_topics=num_topics_lsi)
    logger.info('*** DONE generating LSI...')
    lsi.save(lsi_model_filename)
    logger.info('*** SAVED generating LSI...')
    # convert all articles to latent semantic space, store the result as a MatrixMarket file
    # normalize all vectors to unit length, to simulate cossim in libraries that only support euclidean distance
    gensim.corpora.MmCorpus.serialize(lsi_vectors_file,
                                      (gensim.matutils.unitvec(vec)
                                       for vec in lsi[tfidf[corpus]]))
    mm = gensim.corpora.MmCorpus(lsi_vectors_file)
    # Cap the number of indexed documents at MAX_DOCS (module-level constant).
    num_features, num_docs = mm.num_terms, min(mm.num_docs, MAX_DOCS)
    # Densify the (possibly clipped) corpus into a float32 matrix, one row
    # per document.
    clipped = numpy.empty((num_docs, num_features), dtype=numpy.float32)
    for docno, doc in enumerate(itertools.islice(mm, num_docs)):
        clipped[docno] = gensim.matutils.sparse2full(doc, num_features)

    logger.info('*** Saving clipped corpus as NUMPY...')
    numpy.save(clipped_output_filename, clipped)

    logger.info('*** Generating ANNOY...')
    clipped_corpus = gensim.matutils.Dense2Corpus(clipped,
                                                  documents_columns=False)
    # 'angular' metric on unit-length vectors approximates cosine similarity.
    index_annoy = annoy.AnnoyIndex(num_features, metric='angular')
    for i, vec in enumerate(clipped_corpus):
        index_annoy.add_item(
            i,
            list(gensim.matutils.sparse2full(vec, num_features).astype(float)))
    logger.info('*** Building ANNOY...')
    index_annoy.build(accuracy)
    logger.info('*** Saving ANNOY...')
    index_annoy.save(annoy_output_filename)

    # Handshake with the parent process: announce completion, then block
    # until the parent replies before hard-exiting this child process.
    out_q.put('DONE')
    out_q.close()
    done = in_q.get()
    # os._exit skips interpreter cleanup — presumably intentional for a
    # forked worker so the child never runs the parent's atexit handlers.
    os._exit(0)
    # NOTE(review): unreachable — os._exit(0) above never returns.
    return True
Пример #17
0
# --- Command-line handling and trainer bootstrap -------------------------
# Expects `options` (getopt-style (opt, arg) pairs) to be defined earlier
# in the file.
verbose = False
dump = False
# Initialize explicitly: previously a missing -c/--config option left
# `configFile` unbound and caused a NameError at the config-loading call.
configFile = None
for opt, arg in options:
    if opt == '-h':
        print('No Help yet')
        sys.exit(0)
    elif opt in ('-v', '--version'):
        print("V1.0beta")
        sys.exit(0)
    elif opt in ('-c', '--config'):
        configFile = arg
    elif opt in ('-V', '--verbose'):
        verbose = True
    elif opt in ('-d', '--dump'):
        dump = True
if configFile is None:
    # Fail fast with a clear message instead of a NameError further down.
    print('**** ERROR: No configuration file specified (-c/--config). Exiting!')
    sys.exit(1)
print("============================ Bender Trainer ==============================")
print("           Bender 2.0a - Copyright (c) 2019 Imdat Solak")
print("           Written by: Imdat Solak ([email protected])")
print("=========================================================================")

benderConfig = utils.getBenderConfiguration(configFile, verbose, False)
logger = logging.getLogger(os.path.basename(sys.argv[0]))
logFile = utils.getKeyFromSectionInConfiguration('bender-training', 'train_log_file', 'logs/bender-train.log', benderConfig)
utils.safe_create_directory(os.path.dirname(logFile))
logging.basicConfig(filename=logFile, format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info('**************** NEW TRAINING SESSION STARTED: %s' % ' '.join(sys.argv))
tdc = TrainingDataConverter(benderConfig)
tdc.train()