Example No. 1
# Assumed context for this snippet: sh, torch, FileLock (from the filelock
# package), Rasa's load_data, and an absl-style FLAGS object are imported or
# defined elsewhere in the source file.
def read_nlu_data():
    try:
        cache_dir = sh.ls(FLAGS.cache_dir)
        if 'id2class.set' in cache_dir and 'intent_examples.set' in cache_dir:
            id2class_path = os.path.join(FLAGS.cache_dir, 'id2class.set')
            id2class_lock_path = id2class_path + '.lock'
            intent_examples_path = os.path.join(FLAGS.cache_dir,
                                                'intent_examples.set')
            intent_examples_lock_path = intent_examples_path + '.lock'

            with FileLock(id2class_lock_path):
                id2class = torch.load(id2class_path)

            with FileLock(intent_examples_lock_path):
                intent_examples = torch.load(intent_examples_path)

            return id2class, intent_examples
    except Exception as e:
        logging.error(e)
        sh.mkdir(FLAGS.cache_dir)

    data = load_data(FLAGS.data_dir, 'zh')
    id2class = dict(enumerate(data.intents))
    intent_examples = data.intent_examples

    torch.save(id2class, os.path.join(FLAGS.cache_dir, 'id2class.set'))
    torch.save(intent_examples,
               os.path.join(FLAGS.cache_dir, 'intent_examples.set'))

    return id2class, intent_examples
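
A minimal sketch of the lock-guarded cache pattern used above, assuming the
filelock package and any picklable object; cached_load and its arguments are
illustrative helpers, not part of the original project:

import os

import torch
from filelock import FileLock

def cached_load(path, build_fn):
    # Serialize access across processes, then load the cached object or
    # rebuild it with build_fn and persist it for the next caller.
    with FileLock(path + '.lock'):
        if os.path.exists(path):
            return torch.load(path)
        obj = build_fn()
        torch.save(obj, path)
        return obj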
Example No. 2
def training_data_from_paths(paths: Iterable[Text],
                             language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    training_data_sets = [
        loading.load_data(nlu_file, language) for nlu_file in paths
    ]
    return TrainingData().merge(*training_data_sets)
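
A hypothetical invocation of training_data_from_paths; the file names are
placeholders for real Rasa NLU training files:

merged = training_data_from_paths(
    ["data/nlu/greetings.md", "data/nlu/faq.md"], language="en"
)
print(len(merged.training_examples))  # examples pooled from both files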
Example No. 3
    def _read_nlu_data(self):
        try:
            cache_dir = sh.ls(FLAGS.cache_dir)
            if ('id2entity.set' in cache_dir and 'entity_examples.set' in cache_dir
                    and 'id2class.set' in cache_dir and 'intent_examples.set' in cache_dir):
                id2entity_path = os.path.join(FLAGS.cache_dir, 'id2entity.set')
                id2entity_lock_path = id2entity_path + '.lock'

                entity_examples_path = os.path.join(FLAGS.cache_dir, 'entity_examples.set')
                entity_examples_lock_path = entity_examples_path + '.lock'

                id2class_path = os.path.join(FLAGS.cache_dir, 'id2class.set')
                id2class_lock_path = id2class_path + '.lock'

                intent_examples_path = os.path.join(FLAGS.cache_dir, 'intent_examples.set')
                intent_examples_lock_path = intent_examples_path + '.lock'
                
                with FileLock(id2entity_lock_path):
                    id2entity = torch.load(id2entity_path)

                with FileLock(entity_examples_lock_path):
                    entity_examples = torch.load(entity_examples_path)

                with FileLock(id2class_lock_path):
                    id2class = torch.load(id2class_path)

                with FileLock(intent_examples_lock_path):
                    intent_examples = torch.load(intent_examples_path)
                
                return id2entity, entity_examples, id2class, intent_examples
        except Exception as e:
            logging.error(e)
            sh.mkdir(FLAGS.cache_dir)
        
        data = load_data(FLAGS.data_dir, 'zh')
        entity_lists, entity_examples_cooked, intent_examples = ['O'], [], []

        for item in data.training_examples:
            training_text = item.text
            training_data = item.data

            entity_examples_cooked.append(
                self._predata(training_text, training_data.get("entities", [])))
            intent_examples.append(training_data.get("intent", None))

        for entity in data.entities:
            for tag in ['B', 'I']:
                entity_lists.append(tag + '-' + entity)

        id2entity = dict(enumerate(entity_lists))
        id2class = dict(enumerate(data.intents))

        torch.save(id2entity, os.path.join(FLAGS.cache_dir, 'id2entity.set'))
        torch.save(entity_examples_cooked, os.path.join(FLAGS.cache_dir, 'entity_examples.set'))

        torch.save(id2class, os.path.join(FLAGS.cache_dir, 'id2class.set'))
        torch.save(intent_examples, os.path.join(FLAGS.cache_dir, 'intent_examples.set'))

        return id2entity, entity_examples_cooked, id2class, intent_examples
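
The _predata helper is not shown in this snippet; a plausible sketch of what
such a converter might do, character-level BIO tagging under assumed
start/end/entity keys in each entity dict, offered purely as an assumption:

def predata_sketch(text, entities):
    # Tag every character 'O', then overwrite entity spans with B-/I- labels,
    # matching the 'O'/'B-x'/'I-x' vocabulary built by the code above.
    tags = ['O'] * len(text)
    for ent in entities:
        start, end, label = ent['start'], ent['end'], ent['entity']
        tags[start] = 'B-' + label
        for i in range(start + 1, end):
            tags[i] = 'I-' + label
    return list(text), tags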
Example No. 4
def training_data_from_paths(paths: Iterable[Text],
                             language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    training_datas = [
        loading.load_data(nlu_file, language) for nlu_file in paths
    ]
    merged_training_data = TrainingData().merge(*training_datas)
    merged_training_data.fill_response_phrases()
    return merged_training_data
Example No. 5
def split_nlu_data(args):
    from rasa.nlu.training_data.loading import load_data

    data_path = get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)
    nlu_data = load_data(data_path)
    train, test = nlu_data.train_test_split(args.training_fraction)

    train.persist(args.out, filename="training_data.json")
    test.persist(args.out, filename="test_data.json")
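
For reference, the same split-and-persist flow outside the CLI wrapper might
look like this, bypassing the path-validation helpers (the path and the 0.8
fraction are placeholders; train_frac is the keyword rasa's
TrainingData.train_test_split accepts):

from rasa.nlu.training_data.loading import load_data

nlu_data = load_data("data/nlu.md")
train, test = nlu_data.train_test_split(train_frac=0.8)
train.persist("out", filename="training_data.json")
test.persist("out", filename="test_data.json")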
Example No. 6
File: data.py  Project: sysang/rasa
def split_nlu_data(args: argparse.Namespace) -> None:
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = rasa.cli.utils.get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction, args.random_seed)

    train.persist(args.out, filename=f"training_data.{fformat}")
    test.persist(args.out, filename=f"test_data.{fformat}")
Example No. 7
    def __init__(self, agentName, botconfig, data, **kwargs):
        logger.info("Training Agent " + agentName + " in progress")
        trainingData = load_data(data)
        self.intents = list(trainingData.intents)
        self.entities = list(trainingData.entities)
        trainer = Trainer(config.load(botconfig))
        self.interpreter = trainer.train(trainingData)
        self.model_path = "./models/" + agentName + "/"
        persist_path = trainer.persist(self.model_path)
        self.tar_path = package_model(fingerprint=None,
                                      train_path=persist_path,
                                      output_directory=self.model_path)
        self.model_name = self.tar_path.replace(self.model_path, "")
        self.model_version = self.model_name[:self.model_name.index(".tar.gz")]
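
As a follow-up, the trained interpreter held by this class can parse raw user
text; agent here stands for a hypothetical instance of the class above:

result = agent.interpreter.parse("book a table for two")
print(result["intent"], result["entities"])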
Example No. 8
def split_nlu_data(args):
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction)

    train.persist(args.out,
                  filename="training_data.{}".format(fformat),
                  fformat=fformat)
    test.persist(args.out,
                 filename="test_data.{}".format(fformat),
                 fformat=fformat)
Example No. 9
    async def get_nlu_data(self,
                           language: Optional[Text] = "en") -> TrainingData:
        fake_data_count = self.DEFAULT_FAKE_DATA_COUNT

        for importer in self.config["importers"]:
            if importer.get("name") == "rasam.PlaceholderImporter":
                fake_data_count = importer.get("fake_data_count",
                                               self.DEFAULT_FAKE_DATA_COUNT)

        faker_ = faker.Faker()
        faker_.seed_instance(fake_data_count)

        training_data = [
            loading.load_data(nlu_file, language)
            for nlu_file in self._nlu_files
        ]

        new_training_data = []

        for data in training_data:
            training_examples = []
            example: Message
            for example in data.training_examples:
                if example.get("intent"):
                    matches = [
                        i async for i in self.find_placeholders(example.text)
                    ]
                    if matches:
                        async for new_message in self.replace_placeholders(
                                example, faker_, matches, fake_data_count):
                            training_examples.append(new_message)
                    else:
                        training_examples.append(example)
                else:
                    training_examples.append(example)
            new_training_data.append(
                TrainingData(training_examples, data.entity_synonyms,
                             data.regex_features, data.lookup_tables,
                             data.nlg_stories))

        merged_training_data = TrainingData().merge(*new_training_data)
        merged_training_data.fill_response_phrases()
        return merged_training_data
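
The importer above expands placeholder tokens with the faker library; a tiny
illustration of the instance-level seeding it relies on (the seed value is
arbitrary):

import faker

faker_ = faker.Faker()
faker_.seed_instance(42)  # makes the generated fake data reproducible
print(faker_.name())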
Example No. 10
async def _write_nlu_to_file(
    export_nlu_path: Text,
    evts: List[Dict[Text, Any]]
) -> None:
    """Write the nlu data of the sender_id to the file paths."""
    from rasa.nlu.training_data import TrainingData

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception as e:
        logger.exception("An exception occurred while trying to load the "
                         "NLU data.")

        export_nlu_path = questionary.text(
            message="Could not load existing NLU data, please "
                    "specify where to store NLU data learned in "
                    "this session (this will overwrite any "
                    "existing file). {}".format(str(e)),
            default=PATHS["backup"]).ask()

        if export_nlu_path is None:
            return

        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    # guess the format of the existing file before opening it for writing,
    # since the write would otherwise clobber the content we need to inspect
    if _guess_format(export_nlu_path) in {"md", "unk"}:
        fformat = "md"
    else:
        fformat = "json"

    with open(export_nlu_path, 'w', encoding="utf-8") as f:
        if fformat == "md":
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
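
_guess_format here presumably wraps rasa.nlu.training_data.loading.guess_format,
which inspects a training-data file and returns a format string such as "md",
"json", or "unk" for unrecognized content:

from rasa.nlu.training_data.loading import guess_format

fformat = guess_format("export.md")  # e.g. "md" for Markdown training data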
Example No. 11
def split_nlu_data(args: argparse.Namespace) -> None:
    """Load data from a file path and split the NLU data into test and train examples.

    Args:
        args: Commandline arguments
    """
    from rasa.nlu.training_data.loading import load_data
    from rasa.nlu.training_data.util import get_file_format

    data_path = rasa.cli.utils.get_validated_path(args.nlu, "nlu",
                                                  DEFAULT_DATA_PATH)
    data_path = data.get_nlu_directory(data_path)

    nlu_data = load_data(data_path)
    fformat = get_file_format(data_path)

    train, test = nlu_data.train_test_split(args.training_fraction,
                                            args.random_seed)

    train.persist(args.out, filename=f"training_data.{fformat}")
    test.persist(args.out, filename=f"test_data.{fformat}")
Example No. 12
    def _read_nlu_data(self):
        try:
            cache_dir = sh.ls(FLAGS.cache_dir)
            if 'id2entity.set' in cache_dir and 'entity_examples.set' in cache_dir:
                id2entity_path = os.path.join(FLAGS.cache_dir, 'id2entity.set')
                id2entity_lock_path = id2entity_path + '.lock'

                entity_examples_path = os.path.join(FLAGS.cache_dir, 'entity_examples.set')
                entity_examples_lock_path = entity_examples_path + '.lock'
                
                with FileLock(id2entity_lock_path):
                    id2entity = torch.load(id2entity_path)

                with FileLock(entity_examples_lock_path):
                    entity_examples = torch.load(entity_examples_path)

                return id2entity, entity_examples
        except Exception as e:
            logging.error(e)
            sh.mkdir(FLAGS.cache_dir)
        
        data = load_data(FLAGS.data_dir, 'zh')
        entities, entity_examples = data.entities, data.entity_examples
        entity_lists, entity_examples_cooked = ['O'], []

        for example in entity_examples:
            entity_examples_cooked.append(
                self._predata(example.text, example.get("entities", [])))

        for entity in entities:
            for tag in ['B', 'I']:
                entity_lists.append(tag + '-' + entity)

        id2entity = dict(enumerate(entity_lists))

        torch.save(id2entity, os.path.join(FLAGS.cache_dir, 'id2entity.set'))
        torch.save(entity_examples_cooked, os.path.join(FLAGS.cache_dir, 'entity_examples.set'))

        return id2entity, entity_examples_cooked
Example No. 13
def test_load_data_from_non_existing_file():
    with pytest.raises(ValueError):
        load_data("some path")
Example No. 14
    async def get_nlu_data(self,
                           language: Optional[Text] = "en") -> TrainingData:
        from rasa.nlu.training_data import loading

        path_to_nlu_file = self._custom_get_nlu_file()
        return loading.load_data(path_to_nlu_file)
Example No. 15
    def generate_domain(self) -> Text:

        logger.debug("Generating domain file")
        results = self.get_tagged_entries(self.tag_dict)  # Get tagged entities

        # If nlu_file is specified, look for intents and entities within the NLU file.
        if self.nlu_file:
            logger.debug("Extracting entities and intents from nlu training data (%s)" % self.nlu_file)
            nlu_data = loading.load_data(self.nlu_file)
            if nlu_data:
                if len(results["entities"]) == 0:
                    results["entities"] = list(nlu_data.entities)
                if len(results["intents"]) == 0:
                    results["intents"] = list(nlu_data.intents)

        # If actions_dir is specified, look for registered actions within the actions
        # directory. Keep only the actions / forms that were found within the NLU file.
        if self.actions_dir:
            logger.debug("Extracting actions from action directory (%s)" % self.actions_dir)
            actions = self.get_actions(self.actions_dir)
            if "actions" in actions:
                results["actions"] = actions["actions"]
            if "forms" in actions:
                results["forms"] = actions["forms"]

        logger.debug("Merging identified utterances")
        # If templates exist, append them to the actions
        if "templates" in results and len(results["templates"]) > 0:
            results["actions"] = results["actions"] + list(results["templates"].keys())

        logger.debug("Formatting output")
        # Iterate through the results, report tags that were found, and drop empty keys
        for tag in VALID_SEARCH_TAGS:
            if tag in results and len(results[tag]) > 0:
                print("Found %s %s" % (len(results[tag]), tag))
            else:
                # pop() instead of del so a missing key cannot raise KeyError
                results.pop(tag, None)
                logger.warning("No %s found" % tag)

        # Output the results to stdout; if an output file was specified, send them there
        yaml = YAML()
        yaml.compact(seq_seq=False, seq_map=False)
        if self.output:
            # Output to file
            if os.path.isdir(self.output):
                logger.error("Output location (%s) is a directory.. cannot overwrite" % self.output)
                return "Output location (%s) is a directory.. cannot overwrite" % self.output
            elif os.path.isfile(self.output):
                logger.warning("Output file %s already exists, overwriting..." % self.output)

            output_path = self.output if os.path.isabs(self.output) else os.path.join(os.path.abspath(os.curdir), self.output)

            try:
                stream = open(output_path, "w")
            except IOError:
                stream = open(Path(output_path), "w")  # create the file

            yaml.dump(results, stream)
            print("Results saved to %s" % self.output)
        else:
            yaml.dump(results, sys.stdout)
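
YAML here is presumably ruamel.yaml's round-trip API; compact(seq_seq=False,
seq_map=False) disables the compact inline notation for nested sequences and
mappings when dumping:

import sys
from ruamel.yaml import YAML

yaml = YAML()
yaml.compact(seq_seq=False, seq_map=False)
yaml.dump({"intents": ["greet", "goodbye"]}, sys.stdout)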