def load_yaml_data(self, yaml_data, constants, text_utils):
    """Load phrase-translation rules from the YAML config.

    In the simplest case one rule rewrites a single input phrase into a
    new phrase, e.g. the input "You must state your name." becomes
    "What is your name?".  In general one rule may list several
    alternative input phrases so their translation is described
    compactly.
    """
    if 'comprehensions' not in yaml_data:
        return
    for rule in yaml_data['comprehensions']:
        output_phrase = replace_constant(rule['rule']['then'], constants, text_utils)
        for raw_phrase in ComprehensionTable.__get_node_list(rule['rule']['if']):
            source_phrase = replace_constant(raw_phrase, constants, text_utils)
            self.templates.append((output_phrase, source_phrase))
            self.order2anchor[source_phrase] = output_phrase
def from_yaml(yaml_node, constants, text_utils):
    """Build an ActorGenerate from its YAML node.

    Only the extended (dict) format is supported; a plain string node
    raises NotImplementedError.
    """
    actor = ActorGenerate()
    if isinstance(yaml_node, dict):
        # Extended format: each keyword feeds one of the actor's lists.
        for keyword, value in yaml_node.items():
            if keyword == 'templates':
                for template in value:
                    actor.templates.append(replace_constant(template, constants, text_utils))
            elif keyword == 'template':
                actor.templates.append(replace_constant(value, constants, text_utils))
            elif keyword == 'wordbag_question':
                actor.wordbag_questions.append(replace_constant(value, constants, text_utils))
            elif keyword == 'wordbag_word':
                actor.wordbag_words.append(replace_constant(value, constants, text_utils))
            else:
                raise NotImplementedError()
    elif isinstance(yaml_node, str):
        # Plain-string form is not supported for this actor type.
        raise NotImplementedError()
    return actor
def load(self, rule_paths, model_folder, data_folder, constants, text_utils):
    """Load "no relevant information" replies and rules from the given YAML files.

    Fills self.no_info_replicas, self.unknown_order and self.rules.
    FIX: the original checked `'no_relevant_information' in data` twice;
    the two branches are merged into one (the lists they fill are
    independent, so behavior is unchanged).
    """
    for yaml_path in rule_paths:
        logging.info('Loading NoInformationModel replicas and rules from "%s"', yaml_path)
        with io.open(yaml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

            if 'no_relevant_information' in data:
                section = data['no_relevant_information']
                for s in section['phrases']:
                    self.no_info_replicas.append(replace_constant(s, constants, text_utils))
                if 'rules' in section:
                    for rule_yaml in section['rules']:
                        rule = ScriptingRule.from_yaml(rule_yaml['rule'], constants, text_utils)
                        self.rules.append(rule)

            if 'unknown_order' in data:
                for s in data['unknown_order']:
                    self.unknown_order.append(replace_constant(s, constants, text_utils))

    logging.info('NoInformationModel loaded: %d phrase(s), %d rule(s)',
                 len(self.no_info_replicas), len(self.rules))
def from_yaml(yaml_node, constants, text_utils):
    """Build an ActorSay from its YAML node.

    The node is either the extended (dict) form, a list of utterances,
    or a single utterance string.
    """
    actor = ActorSay()
    # TODO: add detailed diagnostics for malformed actor descriptions!
    # We must figure out whether this is the extended actor description
    # or just a list of replies (possibly with a single element).
    if isinstance(yaml_node, dict):
        # Extended format.
        for keyword in yaml_node.keys():
            if keyword == 'phrases':
                for utterance in yaml_node['phrases']:
                    actor.phrases.append(replace_constant(utterance, constants, text_utils))
            elif keyword == 'exhausted':
                for utterance in yaml_node['exhausted']:
                    actor.exhausted_phrases.append(replace_constant(utterance, constants, text_utils))
            elif keyword == 'known_answer':
                # TODO: validate the flag value: 'skip' | 'utter'
                actor.known_answer_policy = yaml_node[keyword]
            else:
                raise NotImplementedError()
    elif isinstance(yaml_node, list):
        for utterance in yaml_node:
            if not isinstance(utterance, str):
                raise SyntaxError()
            actor.phrases.append(replace_constant(utterance, constants, text_utils))
    elif isinstance(yaml_node, str):
        actor.phrases.append(replace_constant(yaml_node, constants, text_utils))
    return actor
def __load_entries(self):
    """Lazily parse the Q/A file at self.path into self.questions / self.answers.

    File format: lines starting with '#' are comments; one or more
    consecutive 'Q:' lines give alternative questions for one answer;
    the answer is the first non-'Q:' line plus all following lines up
    to the first empty line.

    BUGFIX: the first 'Q:' question is now passed through
    replace_constant, consistent with the alternative questions read in
    the inner loop (previously only alternatives were expanded).
    """
    if self.loaded:
        return
    self.loaded = True
    self.logger.info(u'Start loading QA entries from "%s"', self.path)
    with io.open(self.path, 'r', encoding='utf-8') as rdr:
        for line in rdr:
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == u'#':
                # Comment lines start with '#'
                continue
            elif line.startswith(u'Q:'):
                # One answer may have several alternative questions;
                # question lines start with the "Q:" pattern.
                alt_questions = []
                question = line.replace(u'Q:', u'').strip()
                # BUGFIX: expand constants in the first question too.
                question = replace_constant(question, self.constants, self.text_utils)
                assert len(question) > 0
                alt_questions.append(question)

                # Collect further "Q:" alternatives; the first non-"Q:"
                # line starts the answer.
                answer_lines = []
                for line in rdr:
                    if line.startswith(u'Q:'):
                        question = line.replace(u'Q:', u'').strip()
                        question = replace_constant(question, self.constants, self.text_utils)
                        assert len(question) > 0
                        alt_questions.append(question)
                    else:
                        answer_lines.append(line.replace(u'A:', u'').strip())
                        break

                # Read the remaining answer lines up to the first empty line.
                for line2 in rdr:
                    line2 = line2.strip()
                    if len(line2) == 0:
                        break
                    answer_lines.append(line2.replace(u'A:', u'').strip())

                answer = u' '.join(answer_lines)
                answer = replace_constant(answer, self.constants, self.text_utils)
                assert len(answer) > 0
                if answer.startswith('---'):
                    # Convenience for debugging demo FAQs where answers
                    # are written as "--------".
                    answer = u'<<<<dummy answer for>>> ' + question
                for question in alt_questions:
                    self.questions.append(question)
                    self.answers.append(answer)
    self.logger.info(u'{} QA entries loaded from {}'.format(len(self.questions), self.path))
def load_yaml(self, yaml_root, smalltalk_rule2grammar, constants, text_utils):
    """
    Load the list of rules from a yaml file.
    yaml_root must point to the "smalltalk_rules" node.
    """
    for rule in yaml_root:
        condition = rule['rule']['if']
        action = rule['rule']['then']

        # Simple rules that trigger on the text of a phrase go into a separate
        # list, so the synonymy model can process them in a single batch.
        if 'text' in condition and len(condition) == 1:
            for condition1 in SmalltalkRules.__get_node_list(condition['text']):
                rule_condition = SmalltalkTextCondition(replace_constant(condition1, constants, text_utils))
                if 'say' in action:
                    # NOTE(review): rebinds the loop variable "rule"; harmless
                    # because it is reassigned at the top of the next iteration.
                    rule = SmalltalkSayingRule(rule_condition)
                    for answer1 in SmalltalkRules.__get_node_list(action['say']):
                        rule.add_answer(replace_constant(answer1, constants, text_utils))
                    self.text_rules.append(rule)
                elif 'generate' in action:
                    generative_templates = list(SmalltalkRules.__get_node_list(action['generate']))
                    rule = SmalltalkGeneratorRule(rule_condition, generative_templates)
                    key = rule_condition.get_key()
                    if key in smalltalk_rule2grammar:
                        rule.compiled_grammar = smalltalk_rule2grammar[key]
                    else:
                        # Missing grammar is logged but the rule is still kept.
                        logging.error(u'Missing compiled grammar for rule %s', key)
                    self.text_rules.append(rule)
                else:
                    logging.error(u'"%s" statement is not implemented', action)
                    raise NotImplementedError()
        else:
            # Complex conditions (keywords, regexes, etc.) go into a second list.
            rule_condition = SmalltalkComplexCondition(condition, constants, text_utils)
            if 'generate' in action:
                generative_templates = list(SmalltalkRules.__get_node_list(action['generate']))
                rule = SmalltalkGeneratorRule(rule_condition, generative_templates)
                key = rule_condition.get_key()
                if key in smalltalk_rule2grammar:
                    rule.compiled_grammar = smalltalk_rule2grammar[key]
                else:
                    logging.error(u'Missing compiled grammar for rule "%s"', key)
                self.complex_rules.append(rule)
            elif 'say' in action:
                rule = SmalltalkSayingRule(rule_condition)
                for answer1 in SmalltalkRules.__get_node_list(action['say']):
                    rule.add_answer(replace_constant(answer1, constants, text_utils))
                self.complex_rules.append(rule)
            else:
                raise NotImplementedError()
def load_rules(self, yaml_path, compiled_grammars_path, constants, text_utils):
    """Load bot rules (greetings, goodbyes, forms, scenarios, smalltalk,
    comprehension and common phrases) from the YAML file at yaml_path.

    compiled_grammars_path points to a pickle stream with pre-compiled
    generative grammars keyed by smalltalk-rule key.  Note the load
    order is significant: smalltalk grammars are read before scenarios
    and smalltalk rules that reference them.
    """
    with io.open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
        if 'greeting' in data:
            # Replaces (not extends) any previously loaded greetings.
            self.greetings = []
            for s in data['greeting']:
                self.greetings.append(replace_constant(s, constants, text_utils))
        if 'goodbye' in data:
            self.goodbyes = []
            for s in data['goodbye']:
                self.goodbyes.append(replace_constant(s, constants, text_utils))
        if 'forms' in data:
            for form_node in data['forms']:
                form = VerbalForm.from_yaml(form_node['form'], constants, text_utils)
                self.forms.append(form)
        # Smalltalk rules need the pre-compiled generative grammars.
        smalltalk_rule2grammar = dict()
        # NOTE(review): this "f" shadows the outer YAML file handle; harmless
        # because the outer "with" keeps its own reference, but worth renaming.
        with open(compiled_grammars_path, 'rb') as f:
            n_rules = pickle.load(f)
            for _ in range(n_rules):
                key = pickle.load(f)
                grammar = GenerativeGrammarEngine.unpickle_from(f)
                grammar.set_dictionaries(text_utils.gg_dictionaries)
                smalltalk_rule2grammar[key] = grammar
        if 'scenarios' in data:
            for scenario_node in data['scenarios']:
                scenario = Scenario.load_yaml(scenario_node['scenario'], smalltalk_rule2grammar, constants, text_utils)
                self.scenarios.append(scenario)
        if 'story_rules' in data:
            self.load_story_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # INSTEAD-OF rules
        if 'rules' in data:
            self.load_instead_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        if 'smalltalk_rules' in data:
            self.smalltalk_rules.load_yaml(data['smalltalk_rules'], smalltalk_rule2grammar, constants, text_utils)
        # Comprehension rules are rebuilt from scratch on every load.
        self.comprehension_rules = ComprehensionTable()
        self.comprehension_rules.load_yaml_data(data, constants, text_utils)
        self.common_phrases = []
        if 'common_phrases' in data:
            for common_phrase in data['common_phrases']:
                common_phrase = replace_constant(common_phrase, constants, text_utils)
                self.common_phrases.append(common_phrase)
def __init__(self, data_yaml, constants, text_utils):
    """Parse a synonymy-matching condition from its YAML description.

    The etalon node is taken from 'text', 'assertion' or 'question'
    (the latter two also set the modality); it may be a dict with
    'masks'/'metric'/'threshold', a list of etalons, or a single string.
    """
    super().__init__(data_yaml)
    self.metric = 'synonymy'
    self.threshold = None
    self.modality = None

    if 'text' in data_yaml:
        node = data_yaml['text']
    elif 'assertion' in data_yaml:
        node = data_yaml['assertion']
        self.modality = 'assertion'
    elif 'question' in data_yaml:
        node = data_yaml['question']
        self.modality = 'question'
    else:
        raise NotImplementedError()

    if isinstance(node, dict):
        etalons = node['masks']
        self.metric = node.get('metric', 'synonymy')
        self.threshold = node.get('threshold')
    elif isinstance(node, list):
        etalons = node
    else:
        etalons = [node]

    self.etalons = [replace_constant(e, constants, text_utils) for e in etalons]
def __init__(self, mask_str, constants, text_utils):
    """Tokenize, POS-tag and lemmatize the mask string, building its term list.

    Tokens whose text starts with 'np'/'vi'/'vp'/'ap' are marked as the
    corresponding chunk placeholders.
    """
    self.mask_terms = []
    expanded = replace_constant(mask_str, constants, text_utils)
    tokens = text_utils.tokenizer.tokenize(expanded)
    tagged = list(text_utils.postagger.tag(tokens))
    lemmas = text_utils.lemmatizer.lemmatize(tagged)
    # Prefix order matters only in that the first match wins, matching
    # the original elif chain.
    chunk_prefixes = (('np', 'NP'), ('vi', 'VI'), ('vp', 'VP'), ('ap', 'AP'))
    for token, _tags, lemma in zip(tokens, tagged, lemmas):
        term = MaskTerm()
        term.word = token
        term.norm_word = token.lower()
        term.lemma = lemma[2]
        for prefix, chunk_type in chunk_prefixes:
            if token.startswith(prefix):
                term.chunk_type = chunk_type
                term.chunk_name = token
                break
        self.mask_terms.append(term)
def load_profile(self):
    """Lazily load profile facts from self.profile_path into self.profile_facts.

    Each fact is stored as a (canonized_text, section, u'') tuple.
    Lines starting with '##' are section headers whose text after ':'
    must be one of '1s', '2s', '3'; lines with a single '#' are comments.

    BUGFIX: encoding was misspelled 'utf=8'; it only worked because the
    codec lookup normalizes '=' to '_'.  Use the canonical 'utf-8'.
    """
    logger = logging.getLogger('ProfileFactsReader')
    if self.profile_facts is None:
        logger.info(u'Loading profile facts from "%s"', self.profile_path)
        self.profile_facts = []
        with io.open(self.profile_path, 'r', encoding='utf-8') as rdr:
            current_section = None
            for line in rdr:
                line = line.strip()
                if line:
                    if line.startswith('#'):
                        if line.startswith('##'):
                            # Section header like "## ...: 1s"
                            current_section = line[line.index(':') + 1:].strip()
                            if current_section not in ('1s', '2s', '3'):
                                msg = u'Unknown profile section {}'.format(current_section)
                                raise RuntimeError(msg)
                        else:
                            # Lines with a single '#' are comments.
                            continue
                    else:
                        assert current_section
                        canonized_line = self.text_utils.canonize_text(line)
                        canonized_line = replace_constant(canonized_line, self.constants, self.text_utils)
                        self.profile_facts.append((canonized_line, current_section, u''))
        logger.debug(u'%d facts loaded from "%s"', len(self.profile_facts), self.profile_path)
def load_yaml(self, yaml_node, constants, text_utils):
    """Read optional settings from the YAML node.

    Recognized keys: 'expiration' (int), 'can_answer_question'
    (constant-expanded string) and 'exit_phrases' (list of strings).
    """
    if 'expiration' in yaml_node:
        self.expiration = int(yaml_node['expiration'])
    if 'can_answer_question' in yaml_node:
        raw_question = yaml_node['can_answer_question']
        self.can_answer_question = replace_constant(raw_question, constants, text_utils)
    if 'exit_phrases' in yaml_node:
        self.exit_phrases = list(yaml_node['exit_phrases'])
def from_yaml(yaml_node, constants, text_utils):
    """Create a ContinuationAction from a YAML node.

    The node is either a list of utterance strings or a single string;
    anything else is rejected.
    """
    actor = ContinuationAction()
    if isinstance(yaml_node, str):
        text = replace_constant(yaml_node, constants, text_utils)
        actor.phrases.append(SayingPhrase(text))
    elif isinstance(yaml_node, list):
        for utterance in yaml_node:
            if not isinstance(utterance, str):
                raise SyntaxError()
            text = replace_constant(utterance, constants, text_utils)
            actor.phrases.append(SayingPhrase(text))
    else:
        raise NotImplementedError()
    return actor
def __init__(self, data_yaml, constants, text_utils):
    """Collect 'raw_text' etalons (string or list), expanding constants in each."""
    super().__init__(data_yaml)
    raw = data_yaml[u'raw_text']
    etalons = raw if isinstance(raw, list) else [raw]
    self.etalons = [replace_constant(e, constants, text_utils) for e in etalons]
def load_profile(self):
    """Lazily load profile facts from self.profile_path into self.profile_facts.

    Each fact is a (canonized_text, section, source_path) tuple.
    '## profile_section: X' headers switch the current section
    ('1s', '2s' or '3'); '## import "file"' pulls facts from an
    additional file; single-'#' lines are comments.  A '|' in a fact
    line gives alternatives, one of which is chosen at random.

    BUGFIX: encoding was misspelled 'utf=8' (worked only via codec-name
    normalization); also the nested import loop shadowed the outer
    'line' variable — renamed for clarity.
    """
    logger = logging.getLogger('ProfileFactsReader')
    if self.profile_facts is None:
        logger.info('Loading profile facts from "%s"', self.profile_path)
        self.profile_facts = []
        with io.open(self.profile_path, 'r', encoding='utf-8') as rdr:
            current_section = None
            for line in rdr:
                line = line.strip()
                if line:
                    if line.startswith('#'):
                        if line.startswith('##'):
                            if 'profile_section:' in line:
                                # Knowledge-base section header.
                                current_section = line[line.index(':') + 1:].strip()
                                if current_section not in ('1s', '2s', '3'):
                                    msg = 'Unknown profile section {}'.format(current_section)
                                    raise RuntimeError(msg)
                            elif 'import' in line:
                                # Read facts from an additional file.
                                fn = re.search('import "(.+)"', line).group(1).strip()
                                add_path = os.path.join(os.path.dirname(self.profile_path), fn)
                                logger.debug('Loading facts from file "%s"...', add_path)
                                with io.open(add_path, 'rt', encoding='utf-8') as rdr2:
                                    for line2 in rdr2:
                                        line2 = line2.strip()
                                        if line2 and not line2.startswith('#'):
                                            line1 = random.choice(line2.split('|')).strip()
                                            canonized_line = self.text_utils.canonize_text(line1)
                                            canonized_line = replace_constant(canonized_line, self.constants, self.text_utils)
                                            self.profile_facts.append((canonized_line, current_section, add_path))
                        else:
                            # Lines with a single '#' are comments.
                            continue
                    else:
                        assert current_section
                        line1 = random.choice(line.split('|')).strip()
                        canonized_line = self.text_utils.canonize_text(line1)
                        canonized_line = replace_constant(canonized_line, self.constants, self.text_utils)
                        self.profile_facts.append((canonized_line, current_section, self.profile_path))
        logger.debug('%d facts loaded from "%s"', len(self.profile_facts), self.profile_path)
def __init__(self, data_yaml, constants, text_utils):
    """Build keyword matchers from the 'keyword' node (string or list of strings)."""
    super().__init__(data_yaml)
    node = data_yaml[u'keyword']
    etalons = node if isinstance(node, list) else [node]
    self.matchers = []
    for etalon in etalons:
        expanded = replace_constant(etalon, constants, text_utils)
        self.matchers.append(KeywordMatcher.from_string(expanded))
def __init__(self, data_yaml, constants, text_utils):
    """Collect etalons the bot's previous phrase must be similar to.

    'prev_bot_text' may be a single string or a list of strings.
    """
    super().__init__(data_yaml)
    node = data_yaml[u'prev_bot_text']
    etalons = node if isinstance(node, list) else [node]
    self.etalons = [replace_constant(e, constants, text_utils) for e in etalons]
def choice(self, bot, session, text_utils, phrases0):
    """Pick one phrase, preferring those the bot has not said yet in this session.

    Constants (bot name etc.) are substituted first; if every candidate
    has already been said, falls back to the full list.
    """
    # Substitute constants (bot name etc.)
    phrases = [replace_constant(p, bot.profile.constants, text_utils) for p in phrases0]
    if len(phrases) == 1:
        return phrases[0]
    unsaid = [p for p in phrases if session.count_bot_phrase(p) == 0]
    return random.choice(unsaid if unsaid else phrases)
def __init__(self, data_yaml, constants, text_utils):
    """Parse a text-similarity condition from its YAML description.

    The 'text' node may be a dict with 'masks'/'metric'/'threshold',
    a list of etalons, or a single string.
    """
    super().__init__(data_yaml)
    self.metric = 'synonymy'
    self.threshold = None

    node = data_yaml[u'text']
    if isinstance(node, dict):
        etalons = node['masks']
        self.metric = node.get('metric', 'synonymy')
        self.threshold = node.get('threshold')
    elif isinstance(node, list):
        etalons = node
    else:
        etalons = [node]

    self.etalons = [replace_constant(e, constants, text_utils) for e in etalons]
def from_yaml(yaml_node, constants, text_utils):
    """Build an ActorAnswer from its YAML node.

    The node is either an extended dict with 'question' and optional
    'output' keys, or a plain question string.

    CONSISTENCY FIX: the plain-string branch now passes the question
    through replace_constant, matching the dict branch (and the other
    actor loaders in this file).
    """
    actor = ActorAnswer()
    # TODO: add detailed diagnostics for malformed actor descriptions!
    if isinstance(yaml_node, dict):
        # Extended format.
        for inner_keyword in yaml_node.keys():
            if 'question' == inner_keyword:
                actor.question = replace_constant(yaml_node['question'], constants, text_utils)
            elif 'output' == inner_keyword:
                # TODO: check that the value is 'premise'
                actor.output = yaml_node[inner_keyword]
            else:
                raise NotImplementedError()
    elif isinstance(yaml_node, str):
        actor.question = replace_constant(yaml_node, constants, text_utils)
    else:
        raise NotImplementedError()
    return actor
def from_yaml(yaml_node, constants, text_utils):
    """Build a VerbalForm (name, ok-action, fields) from its YAML node.

    Each field declares where its value comes from: an entity, a
    reflection, or an explicit 'source' ('raw_response' | 'entity' |
    'reflection').
    """
    form = VerbalForm()
    form.name = yaml_node['name']
    form.ok_action = yaml_node['action']
    form.compiled_ok_action = ActorBase.from_yaml(form.ok_action, constants, text_utils)

    for field_wrapper in yaml_node.get('fields', []):
        node = field_wrapper['field']
        field = VerbalFormField()
        field.name = node['name']
        field.question = replace_constant(node['question'], constants, text_utils)
        if 'from_entity' in node:
            field.from_entity = node['from_entity']
            field.source = 'entity'
        elif 'from_reflection' in node:
            field.from_reflection = node['from_reflection']
            field.source = 'reflection'
        elif 'source' in node:
            field.source = node['source']
            if field.source not in ('raw_response', 'entity', 'reflection'):
                logging.error(u'Unknown field source "%s"', field.source)
                raise RuntimeError()
        else:
            # TODO: produce a clear diagnostic message
            raise NotImplementedError()
        form.fields.append(field)

    # TODO: load the instead-of and smalltalk rules checked while the form runs.
    #self.insteadof_rules
    return form
def __init__(self, data_yaml, constants, text_utils):
    """Store the 'regex' pattern string with constants expanded."""
    super().__init__(data_yaml)
    pattern = data_yaml['regex']
    self.rx_str = replace_constant(pattern, constants, text_utils)
def load_rules(self, yaml_path, compiled_grammars_path, constants, text_utils):
    """Load bot rules from the YAML file at yaml_path, recursing into files
    listed under 'import'.

    Appends to (does not replace) the greeting/confirmation/negation/
    goodbye lists, so imported files accumulate.  Grammar compilation is
    currently disabled (see the commented-out block), so
    smalltalk_rule2grammar stays empty and compiled_grammars_path is
    only passed through to the sub-loaders.
    """
    logging.debug('Loading rules from "%s"...', yaml_path)
    self.rule_paths.append(yaml_path)
    with io.open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
        if 'greeting' in data:
            for s in data['greeting']:
                self.greetings.append(replace_constant(s, constants, text_utils))
        if 'confirmations' in data:
            for s in data['confirmations']:
                self.confirmations.append(replace_constant(s, constants, text_utils))
        if 'negations' in data:
            for s in data['negations']:
                self.negations.append(replace_constant(s, constants, text_utils))
        if 'goodbye' in data:
            for s in data['goodbye']:
                self.goodbyes.append(replace_constant(s, constants, text_utils))
        if 'forms' in data:
            for form_node in data['forms']:
                form = VerbalForm.from_yaml(form_node['form'], constants, text_utils)
                self.forms.append(form)
        # Smalltalk rules need pre-compiled generative grammars.
        smalltalk_rule2grammar = dict()
        #with open(compiled_grammars_path, 'rb') as f:
        #    n_rules = pickle.load(f)
        #    for _ in range(n_rules):
        #        key = pickle.load(f)
        #        grammar = GenerativeGrammarEngine.unpickle_from(f)
        #        grammar.set_dictionaries(text_utils.gg_dictionaries)
        #        smalltalk_rule2grammar[key] = grammar
        if 'story_rules' in data:
            self.load_story_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # Rules that take priority on the interlocutor's first utterance in a session.
        if 'first_reply_rules' in data:
            self.load_first_reply_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # INSTEAD-OF rules
        if 'rules' in data:
            self.load_instead_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # AFTER rules (e.g. launching extra scenarios on keywords).
        if 'after_rules' in data:
            self.load_after_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        if 'smalltalk_rules' in data:
            self.smalltalk_rules.load_yaml(data['smalltalk_rules'], smalltalk_rule2grammar, constants, text_utils)
        if 'scenarios' in data:
            for scenario_node in data['scenarios']:
                scenario = Scenario.load_yaml(scenario_node['scenario'], self, smalltalk_rule2grammar, constants, text_utils)
                self.scenarios.append(scenario)
        if 'continuation' in data:
            self.continuation_rules.load_yaml(data['continuation'], constants, text_utils)
            if 'files' in data['continuation']:
                # Continuation rules may be spread over extra files next to yaml_path.
                for fname in data['continuation']['files']:
                    # NOTE(review): this "f" shadows the outer YAML file handle;
                    # harmless since the outer "with" keeps its own reference.
                    with io.open(os.path.join(os.path.dirname(yaml_path), fname), 'r', encoding='utf-8') as f:
                        data2 = yaml.safe_load(f)
                        self.continuation_rules.load_yaml(data2, constants, text_utils)
        self.comprehension_rules.load_yaml_data(data, constants, text_utils)
        if 'common_phrases' in data:
            for common_phrase in data['common_phrases']:
                common_phrase = replace_constant(common_phrase, constants, text_utils)
                self.common_phrases.append(common_phrase)
        if 'import' in data:
            # Recursively load every imported rules file (paths relative to yaml_path).
            for import_filename in data['import']:
                add_path = os.path.join(os.path.dirname(yaml_path), import_filename)
                self.load_rules(add_path, compiled_grammars_path, constants, text_utils)
def load_rules(self, yaml_path, compiled_grammars_path, constants, text_utils):
    """Load bot rules from the YAML file at yaml_path.

    Unlike the recursive variant, this loader replaces the greeting and
    goodbye lists and rebuilds continuation/comprehension rules and
    common phrases from scratch.  Grammar compilation is currently
    disabled (see the commented-out block), so smalltalk_rule2grammar
    stays empty.
    """
    with io.open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
        if 'greeting' in data:
            self.greetings = []
            for s in data['greeting']:
                self.greetings.append(replace_constant(s, constants, text_utils))
        if 'goodbye' in data:
            self.goodbyes = []
            for s in data['goodbye']:
                self.goodbyes.append(replace_constant(s, constants, text_utils))
        if 'forms' in data:
            for form_node in data['forms']:
                form = VerbalForm.from_yaml(form_node['form'], constants, text_utils)
                self.forms.append(form)
        # Smalltalk rules need pre-compiled generative grammars.
        smalltalk_rule2grammar = dict()
        #with open(compiled_grammars_path, 'rb') as f:
        #    n_rules = pickle.load(f)
        #    for _ in range(n_rules):
        #        key = pickle.load(f)
        #        grammar = GenerativeGrammarEngine.unpickle_from(f)
        #        grammar.set_dictionaries(text_utils.gg_dictionaries)
        #        smalltalk_rule2grammar[key] = grammar
        if 'story_rules' in data:
            self.load_story_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # INSTEAD-OF rules
        if 'rules' in data:
            self.load_instead_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        # AFTER rules (e.g. launching extra scenarios on keywords).
        if 'after_rules' in data:
            self.load_after_rules(os.path.dirname(yaml_path), data, compiled_grammars_path, constants, text_utils)
        if 'smalltalk_rules' in data:
            self.smalltalk_rules.load_yaml(data['smalltalk_rules'], smalltalk_rule2grammar, constants, text_utils)
        if 'scenarios' in data:
            for scenario_node in data['scenarios']:
                scenario = Scenario.load_yaml(scenario_node['scenario'], self, smalltalk_rule2grammar, constants, text_utils)
                self.scenarios.append(scenario)
        # Continuation rules are rebuilt from scratch on every load.
        self.continuation_rules = ContinuationRules()
        if 'continuation' in data:
            self.continuation_rules.load_yaml(data['continuation'], constants, text_utils)
            if 'files' in data['continuation']:
                # Continuation rules may be spread over extra files next to yaml_path.
                for fname in data['continuation']['files']:
                    # NOTE(review): this "f" shadows the outer YAML file handle;
                    # harmless since the outer "with" keeps its own reference.
                    with io.open(os.path.join(os.path.dirname(yaml_path), fname), 'r', encoding='utf-8') as f:
                        data2 = yaml.safe_load(f)
                        self.continuation_rules.load_yaml(data2, constants, text_utils)
        self.comprehension_rules = ComprehensionTable()
        self.comprehension_rules.load_yaml_data(data, constants, text_utils)
        self.common_phrases = []
        if 'common_phrases' in data:
            for common_phrase in data['common_phrases']:
                common_phrase = replace_constant(common_phrase, constants, text_utils)
                self.common_phrases.append(common_phrase)
def from_yaml(yaml_node, constants, text_utils):
    """Create an ActorQueryDb whose query template is the node string with constants expanded."""
    actor = ActorQueryDb()
    expanded = replace_constant(yaml_node, constants, text_utils)
    actor.query_template = SayingPhrase(expanded)
    return actor