def generate(self, message: str, doc: Doc = None,
             ignore_topics: List[str] = None) -> Optional[str]:
    """Generate a reply to *message* using the Markov model.

    :param message: raw input text; filtered and parsed when *doc* is absent.
    :param doc: optional pre-parsed spaCy Doc; when None, *message* is parsed.
    :param ignore_topics: token texts to exclude from subject selection.
    :return: a smoothed reply string, or a canned fallback response when no
        subjects are known or generation fails.
    """
    # A mutable default ([]) would be shared across calls; use a None sentinel.
    if ignore_topics is None:
        ignore_topics = []

    if doc is None:
        filtered_message = MarkovFilters.filter_input(message)
        doc = self._nlp(filtered_message)

    # Collect every token the Markov DB knows about as a candidate subject.
    subjects = []
    for token in doc:
        if token.text in ignore_topics:
            continue
        markov_word = self._markov_model.select(token.text)
        if markov_word is not None:
            subjects.append(markov_word)

    if not subjects:
        # Nothing in the input was recognized; reply with a canned line.
        UNHEARD_LIST = [
            "Didn’t catch that",
            "Try again",
            "Are you even trying",
            "That might be too much for me right now",
            "I’ll learn how eventually",
            "I don't know how to respond to that yet"
        ]
        return random.choice(UNHEARD_LIST)

    def structure_generator():
        # Yield predicted sentence structures forever; sentence count is drawn
        # from observed input statistics when available, else uniformly 1-4.
        sentence_stats_manager = InputTextStatManager()
        while True:
            choices, p_values = sentence_stats_manager.probabilities()
            if len(choices) > 0:
                num_sentences = np.random.choice(choices, p=p_values)
            else:
                num_sentences = np.random.randint(1, 5)
            yield self._structure_scheduler.predict(
                num_sentences=num_sentences)

    generator = MarkovGenerator(structure_generator=structure_generator(),
                                subjects=subjects)

    sentences = generator.generate(db=self._markov_model)
    if sentences is None:
        # Generation failed despite known subjects; reply with a canned line.
        MISUNDERSTOOD_LIST = [
            'Huh.',
            'Huh',
            'Huh!',
            'Huh?',
            'Huh!?',
            'HUH?'
        ]
        return random.choice(MISUNDERSTOOD_LIST)

    reply_words = []
    for sentence in sentences:
        for word in sentence:
            # Compound tokens keep their original casing; everything else is
            # re-capitalized according to its predicted mode.
            if not word.compound:
                text = CapitalizationMode.transform(word.mode, word.text)
            else:
                text = word.text
            reply_words.append(text)

    reply = " ".join(reply_words)
    return MarkovFilters.smooth_output(reply)
def from_token(token: Token) -> 'MarkovWord':
    """Build a MarkovWord from a spaCy token with no neighbors."""
    capitalization = CapitalizationMode.from_token(
        token, CAPITALIZATION_COMPOUND_RULES)
    is_compound = capitalization == CapitalizationMode.COMPOUND
    return MarkovWord(token.text, Pos.from_token(token),
                      compound=is_compound, neighbors={})
def from_token(token: Token) -> 'MarkovNeighbor':
    """Build a MarkovNeighbor from a spaCy token with zeroed statistics."""
    text = token.text
    key = text.lower()
    is_compound = (CapitalizationMode.from_token(
        token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND)
    pos = Pos.from_token(token)
    # Fresh counters: two value slots and one distance bucket per window offset.
    values = [0, 0]
    dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
    return MarkovNeighbor(key, text, pos, is_compound, values, dist)
def preprocess(self, doc: Doc) -> bool:
    """Accumulate structure-model training pairs from *doc*.

    Appends one (sequence, label) pair per token plus one per end-of-sentence
    marker to self.data / self.labels. Returns False as soon as the training
    set reaches STRUCTURE_MODEL_TRAINING_MAX_SIZE, True otherwise.
    """
    # Stop early if the training buffer is already full.
    if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
        return False
    sequence = []
    previous_item = None
    for sentence_idx, sentence in enumerate(doc.sents):
        # Re-check capacity between sentences so long docs can't overshoot.
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False
        for token_idx, token in enumerate(sentence):
            # Embed this token's PoS + capitalization; it becomes the label
            # for the sequence built from everything *before* it.
            item = StructureFeatureAnalyzer.analyze(
                token, CapitalizationMode.from_token(
                    token, CAPITALIZATION_COMPOUND_RULES))
            label = item
            if len(sequence) == 0:
                # Offset data by one, making label point to the next data item
                sequence.append(
                    PoSCapitalizationMode(
                        Pos.NONE, CapitalizationMode.NONE).to_embedding())
            else:
                sequence.append(previous_item)
            # We only want the latest SEQUENCE_LENGTH items
            sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]
            # copy() so later mutation of `sequence` can't alias stored data.
            self.data.append(sequence.copy())
            self.labels.append(label)
            previous_item = item
        # Handle EOS after each sentence
        item = PoSCapitalizationMode(
            Pos.EOS, CapitalizationMode.NONE).to_embedding()
        label = item
        sequence.append(previous_item)
        # We only want the latest SEQUENCE_LENGTH items
        sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]
        self.data.append(sequence.copy())
        self.labels.append(label)
        previous_item = item
    return True
def main():
    """Load the Markov DB and structure model, then print 1000 generated messages."""
    np.random.seed(int(time.time()))

    markov_db = MarkovTrieDb(MARKOV_DB_PATH)

    structure_model = StructureModelScheduler(use_gpu=USE_GPU)
    structure_model.start()
    structure_model.load(STRUCTURE_MODEL_PATH)

    # Seed subjects from a fixed word list; skip words the DB doesn't know.
    subjects = []
    for word in ['Some', 'Words', 'Here']:
        select_word = markov_db.select(word)
        if select_word is not None:
            subjects.append(select_word)
        else:
            print("Couldn't select %s" % word)

    # Defined once instead of per-iteration; each call still yields a fresh
    # generator producing single-sentence structure predictions forever.
    def structure_generator():
        while True:
            yield structure_model.predict(num_sentences=1)

    for _ in range(1000):
        markov_generator = MarkovGenerator(structure_generator(), subjects)

        sentences = markov_generator.generate(markov_db)
        if sentences is None:
            continue

        words = []
        for sentence in sentences:
            for word in sentence:
                # Compound tokens keep their casing; others are re-capitalized.
                if not word.compound:
                    text = CapitalizationMode.transform(word.mode, word.text)
                else:
                    text = word.text
                words.append(text)

        message = MarkovFilters.smooth_output(" ".join(words))
        print(message)
def generate(self, message: str, doc: Doc = None,
             ignore_topics: List[str] = None) -> Optional[str]:
    """Generate a reply to *message* using the Markov model.

    :param message: raw input text; filtered and parsed when *doc* is absent.
    :param doc: optional pre-parsed spaCy Doc; when None, *message* is parsed.
    :param ignore_topics: token texts to exclude from subject selection.
    :return: a smoothed reply, or a fixed fallback string when no subjects
        are known or generation fails.
    """
    # A mutable default ([]) would be shared across calls; use a None sentinel.
    if ignore_topics is None:
        ignore_topics = []

    if doc is None:
        filtered_message = MarkovFilters.filter_input(message)
        doc = self._nlp(filtered_message)

    # Collect every token the Markov DB knows about as a candidate subject.
    subjects = []
    for token in doc:
        if token.text in ignore_topics:
            continue
        markov_word = self._markov_model.select(token.text)
        if markov_word is not None:
            subjects.append(markov_word)

    if not subjects:
        return "I wasn't trained on that!"

    def structure_generator():
        # Yield predicted sentence structures forever; sentence count is drawn
        # from observed input statistics when available, else uniformly 1-4.
        sentence_stats_manager = InputTextStatManager()
        while True:
            choices, p_values = sentence_stats_manager.probabilities()
            if len(choices) > 0:
                num_sentences = np.random.choice(choices, p=p_values)
            else:
                num_sentences = np.random.randint(1, 5)
            yield self._structure_scheduler.predict(num_sentences=num_sentences)

    generator = MarkovGenerator(structure_generator=structure_generator(),
                                subjects=subjects)

    sentences = generator.generate(db=self._markov_model)
    if sentences is None:
        return "Huh?"

    reply_words = []
    for sentence in sentences:
        for word in sentence:
            # Compound tokens keep their original casing; everything else is
            # re-capitalized according to its predicted mode.
            if not word.compound:
                text = CapitalizationMode.transform(word.mode, word.text)
            else:
                text = word.text
            reply_words.append(text)

    reply = " ".join(reply_words)
    return MarkovFilters.smooth_output(reply)
def main():
    """Load the Markov DB and structure model, then print 1000 generated messages."""
    np.random.seed(int(time.time()))

    markov_db = MarkovTrieDb(MARKOV_DB_PATH)

    structure_model = StructureModelScheduler(use_gpu=USE_GPU)
    structure_model.start()
    structure_model.load(STRUCTURE_MODEL_PATH)

    # Seed subjects from a fixed word list; skip words the DB doesn't know.
    subjects = []
    for word in ['Some', 'Words', 'Here']:
        select_word = markov_db.select(word)
        if select_word is not None:
            subjects.append(select_word)
        else:
            print("Couldn't select %s" % word)

    # Defined once instead of per-iteration; each call still yields a fresh
    # generator producing single-sentence structure predictions forever.
    def structure_generator():
        while True:
            yield structure_model.predict(num_sentences=1)

    for _ in range(1000):
        markov_generator = MarkovGenerator(structure_generator(), subjects)

        sentences = markov_generator.generate(markov_db)
        if sentences is None:
            continue

        words = []
        for sentence in sentences:
            for word in sentence:
                # Compound tokens keep their casing; others are re-capitalized.
                if not word.compound:
                    text = CapitalizationMode.transform(word.mode, word.text)
                else:
                    text = word.text
                words.append(text)

        message = MarkovFilters.smooth_output(" ".join(words))
        print(message)
def preprocess(self, doc: Doc) -> bool:
    """Accumulate structure-model training pairs from *doc*.

    Appends one (sequence, label) pair per token plus one per end-of-sentence
    marker to self.data / self.labels. Returns False as soon as the training
    set reaches STRUCTURE_MODEL_TRAINING_MAX_SIZE, True otherwise.
    """
    # Stop early if the training buffer is already full.
    if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
        return False
    sequence = []
    previous_item = None
    for sentence_idx, sentence in enumerate(doc.sents):
        # Re-check capacity between sentences so long docs can't overshoot.
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False
        for token_idx, token in enumerate(sentence):
            # Embed this token's PoS + capitalization; it becomes the label
            # for the sequence built from everything *before* it.
            item = StructureFeatureAnalyzer.analyze(
                token, CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES))
            label = item
            if len(sequence) == 0:
                # Offset data by one, making label point to the next data item
                sequence.append(PoSCapitalizationMode(Pos.NONE, CapitalizationMode.NONE).to_embedding())
            else:
                sequence.append(previous_item)
            # We only want the latest SEQUENCE_LENGTH items
            sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]
            # copy() so later mutation of `sequence` can't alias stored data.
            self.data.append(sequence.copy())
            self.labels.append(label)
            previous_item = item
        # Handle EOS after each sentence
        item = PoSCapitalizationMode(Pos.EOS, CapitalizationMode.NONE).to_embedding()
        label = item
        sequence.append(previous_item)
        # We only want the latest SEQUENCE_LENGTH items
        sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]
        self.data.append(sequence.copy())
        self.labels.append(label)
        previous_item = item
    return True
def from_token(token: Token) -> 'MarkovWord':
    """Build a MarkovWord from a spaCy token with no neighbors."""
    mode = CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES)
    return MarkovWord(token.text,
                      Pos.from_token(token),
                      compound=(mode == CapitalizationMode.COMPOUND),
                      neighbors={})
def analyze(token: Token, mode: CapitalizationMode):
    """Embed the token's part-of-speech together with *mode* as one integer."""
    # Distinct local name avoids shadowing the `mode` parameter.
    combined = PoSCapitalizationMode(Pos.from_token(token), mode)
    return combined.to_embedding()
def from_embedding(embedding: int):
    """Decode an integer embedding back into a PoSCapitalizationMode."""
    # The embedding packs (pos, mode) in base len(CapitalizationMode).
    base = len(CapitalizationMode)
    pos_index = int(embedding / base)
    mode_index = int(embedding % base)
    return PoSCapitalizationMode(Pos(pos_index), CapitalizationMode(mode_index))