    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "query",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        config = HParams(
            {
                "model": {
                    "name": "bert-base-uncased"
                },
                "tokenizer": {
                    "name": "bert-base-uncased"
                },
                "max_seq_length": 128,
                "query_pack_name": "query"
            }, None)
        nlp.add_processor(BertBasedQueryCreator(), config=config)

        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            query_pack = m_pack.get_pack("query")
            self.assertEqual(len(query_pack.generics), 1)
            self.assertIsInstance(query_pack.generics[0], Query)
            query = query_pack.generics[0].value
            self.assertEqual(query.shape, (1, 768))
Example #2
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "input",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        translator_config = HParams(
            {
                "src_language": "de",
                "target_language": "en",
                "in_pack_name": "input",
                "out_pack_name": "result"
            }, None)

        nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
        nlp.initialize()

        english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(set(m_pack._pack_names),
                             set(["input", "output", "result"]))
            self.assertEqual(
                m_pack.get_pack("result").text, english_results[idx] + "\n")
Example #3
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
Example #4
    def test_document_and_passage_mode(self, doc_mode):
        resources: Resources = Resources()
        config: HParams = HParams({"doc_mode": doc_mode}, default_hparams=None)
        self.reader.initialize(resources, config)
        data_packs: List[DataPack] = \
            [data_pack for data_pack in self.reader.iter(self.data_dir,
                                                         'dev')]

        # get all queries and all documents
        queries: List[Query] = []
        documents: Dict[str, Document] = dict()
        for data_pack in data_packs:
            query_entries = list(data_pack.get_entries_by_type(Query))
            doc_entries = list(data_pack.get_entries_by_type(Document))

            self.assertTrue(len(query_entries) + len(doc_entries) == 1)

            if len(query_entries) > 0:
                query_entry: Query = query_entries[0]
                queries.append(query_entry)
            else:
                doc_entry: Document = doc_entries[0]
                documents[data_pack.meta.doc_id] = doc_entry

        # match text of documents relevant to the queries to the actual text
        for i, query in enumerate(queries):
            expected_query = self.expected_queries[i]
            expected_ids = self.expected_doc_ids[doc_mode][i]
            self.assertEqual(query.query, expected_query)
            self.assertCountEqual(query.doc_ids["relevant_docs"], expected_ids)
            for doc_id in expected_ids:
                expected_text = self.get_expected_text(doc_id, doc_mode)
                self.assertEqual(documents[doc_id].text, expected_text)
Example #5
def create_class_with_kwargs(class_name: str,
                             class_args: Dict,
                             h_params: Optional[Dict] = None):
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    if h_params is None:
        h_params = {}

    p_params: Dict = {}

    if "config_path" in h_params and not h_params["config_path"] is None:
        filebased_hparams = yaml.safe_load(open(h_params["config_path"]))
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_hparams()

    processor_hparams = HParams(p_params, default_processor_hparams)

    return obj, processor_hparams
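
A minimal usage sketch for the helper above; the fully qualified class name is taken from import paths elsewhere in this listing, and the empty overwrite dict is an illustrative assumption:

# Usage sketch only (not part of the original source). Resolve a processor
# class by its import path and layer an (empty) inline overwrite on top of the
# class defaults; omitting "config_path" skips the file-based layer above.
tokenizer, tokenizer_hparams = create_class_with_kwargs(
    class_name="forte.processors.nltk_processors.NLTKWordTokenizer",
    class_args={},
    h_params={"overwrite_configs": {}},
)
# `tokenizer` is the instantiated processor; `tokenizer_hparams` is an HParams
# built against cls.default_hparams().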
Example #6
File: utils.py  Project: williamwhe/forte
def create_class_with_kwargs(class_name: str,
                             class_args: Dict,
                             h_params: Optional[Dict] = None):
    r"""Create class with the given arguments.

    Args:
        class_name (str): Class name.
        class_args (Dict): Class arguments.
        h_params (Dict): Hyperparameters for the processor.

    Returns:
        The class object and the hyperparameters for the processor.
    """
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    if h_params is None:
        h_params = {}

    p_params: Dict = {}

    if "config_path" in h_params and not h_params["config_path"] is None:
        filebased_hparams = yaml.safe_load(open(h_params["config_path"]))
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_configs()

    processor_hparams = HParams(p_params, default_processor_hparams)

    return obj, processor_hparams
Example #7
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Example #8
    def __init__(self, hparams: Optional[Union[Dict, HParams]] = None):
        super().__init__()
        self._hparams = HParams(hparams=hparams,
                                default_hparams=self.default_hparams())
        self._meta_data: Dict[int, str] = {}

        index_type = self._hparams.index_type
        device = self._hparams.device
        dim = self._hparams.dim

        if device.lower().startswith("gpu"):
            if isinstance(index_type,
                          str) and not index_type.startswith("Gpu"):
                index_type = "Gpu" + index_type

            index_class = utils.get_class(index_type, module_paths=["faiss"])
            gpu_resource = faiss.StandardGpuResources()
            gpu_id = int(device[3:])
            if faiss.get_num_gpus() < gpu_id:
                gpu_id = 0
                logging.warning(
                    "Cannot create the index on device %s. "
                    "Total number of GPUs on this machine is "
                    "%s. Using gpu0 for the index.", self._hparams.device,
                    faiss.get_num_gpus())
            config_class_name = \
                self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
            config = utils.get_class(config_class_name,
                                     module_paths=["faiss"])()
            config.device = gpu_id
            self._index = index_class(gpu_resource, dim, config)

        else:
            index_class = utils.get_class(index_type, module_paths=["faiss"])
            self._index = index_class(dim)
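
A minimal construction sketch on CPU, reusing the hyperparameter names the constructor above reads (index_type, dim, device); the concrete values are assumptions for illustration:

# Sketch only: with a device that does not start with "gpu", the constructor
# above resolves faiss.IndexFlatIP and builds it with the given dimension.
indexer = EmbeddingBasedIndexer(hparams={
    "index_type": "IndexFlatIP",   # plain inner-product index in faiss
    "dim": 768,
    "device": "cpu",
})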
Example #9
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #10
    def __init__(self, cross_pack: bool = True):
        super().__init__(cross_pack)
        self.batch_is_full = False

        default_config = HParams(None, self.default_configs())
        self.input_pack_name = default_config.input_pack_name
        self.batch_size = default_config.batch_size
        self.initialize(default_config)
Example #11
    def set_reader(self, reader: BaseReader,
                   config: Optional[Union[HParams, Dict[str, Any]]] = None):
        self._reader = reader

        if config is None:
            config = reader.default_configs()
        config = HParams(config, reader.default_configs())

        self._reader_config = config
Example #12
    def __init__(self, cross_pack: bool = True):
        super().__init__(cross_pack)
        # self.instance_num_in_current_batch = 0
        self.batch_is_full = False
        self.last_batch = False

        default_config = HParams(None, self.default_hparams())
        self.input_pack_name = default_config.input_pack_name
        self.batch_size = default_config.batch_size
        self.initialize(default_config)
Example #13
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = \
            "forte/processors/base/tests/data_samples/ontonotes/00/"
Example #14
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                "lang": "en",
                # Language code for the language to build the Pipeline
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()
Example #15
    def test_processor(self, batch_size):
        config = HParams({"batcher": {
            "batch_size": batch_size
        }}, self.dummy.default_hparams())
        self.nlp.add_processor(NLTKSentenceSegmenter())
        self.nlp.add_processor(self.dummy, config=config)
        self.nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = self.nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(self.dummy.counter, (sent_len // batch_size +
                                              (sent_len % batch_size > 0)))
Example #16
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams({"input_pack_name": "input",
                                 "output_pack_name": "output"},
                                MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(m_pack._pack_names, ["input", "output"])
            self.assertEqual(m_pack.get_pack("input").text, texts[idx] + "\n")
Example #17
    def add_processor(self, processor: BaseProcessor,
                      config: Optional[Union[HParams, Dict[str, Any]]] = None,
                      selector: Optional[Selector] = None):
        self._processors_index[processor.component_name] = len(self.processors)

        self._processors.append(processor)

        if config is None:
            config = processor.default_configs()
        config = HParams(config, processor.default_configs())

        self.processor_configs.append(config)

        if selector is None:
            self._selectors.append(DummySelector())
        else:
            self._selectors.append(selector)
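
Since `config` accepts either an HParams object or a plain dict, callers may pass the dict directly and let add_processor wrap it; a minimal sketch, borrowing the reader and writer settings that appear elsewhere in this listing:

# Sketch only (not part of the original source): the dict is merged against the
# processor's default configs inside add_processor, as in the method above.
from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.writers import DocIdJsonPackWriter

pl = Pipeline()
pl.set_reader(OntonotesReader())
pl.add_processor(DocIdJsonPackWriter(), config={'output_dir': '.'})
pl.initialize()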
Example #18
def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #19
def main():
    import sys
    ner_dir, srl_dir = sys.argv[  # pylint: disable=unbalanced-tuple-unpacking
        1:3]

    output_config = HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    )

    eng_text = "The plain green Norway spruce is displayed in the gallery's " \
               "foyer. Wentworth worked as an assistant to sculptor Henry " \
               "Moore in the late 1960s. His reputation as a sculptor grew " \
               "in the 1980s."

    fr_text = "Van Gogh grandit au sein d'une famille de " \
              "l'ancienne bourgeoisie."

    stanford_nlp_example1('en', eng_text, output_config)
    stanford_nlp_example1('fr', fr_text, output_config)

    string_processor_example(ner_dir, srl_dir)
Example #20
    def initialize(self, config: HParams):
        config_ = HParams(config, self.default_hparams())
        self.batch_size = config_.batch_size

        # self.instance_num_in_current_batch = 0
        self.batch_is_full = False
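
The recurring `HParams(config, self.default_hparams())` pattern relies on texar's layering rule: keys in the first argument override the defaults in the second, and missing keys fall back to them. A standalone sketch with made-up keys:

from texar.torch import HParams

# Illustrative defaults; "batch_size" and "scope" are placeholder names.
defaults = {"batch_size": 10, "scope": "batcher"}
config_ = HParams({"batch_size": 4}, defaults)
print(config_.batch_size)  # 4, taken from the user-supplied config
print(config_.scope)       # "batcher", falls back to the default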
Example #21
    def __init__(self, cross_pack=True):
        super().__init__(cross_pack)
        # self.instance_num_in_current_batch = 0
        self.batch_is_full = False
        default_config = HParams(None, self.default_hparams())
        self.batch_size = default_config.batch_size
Example #22
    def initialize(self, config: HParams):
        config_ = HParams(config, self.default_hparams())
        self.batch_size = config_.batch_size
        self.batch_is_full = False
Example #23
    def __init__(self, cross_pack=True):
        super().__init__(cross_pack)
        self.batch_is_full = False
        self.last_batch = False
        default_config = HParams(None, self.default_hparams())
        self.batch_size = default_config.batch_size
Example #24
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(OntonotesReader())
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current directory and
# will use the DocID field in the data pack as the file name.
nlp.add_processor(DocIdJsonPackWriter(), HParams(
    {
        'output_dir': '.'
    },
    DocIdJsonPackWriter.default_hparams(),
))

nlp.initialize()

nlp.run(data_path)
Example #25
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(OntonotesReader())
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current directory and
# will use the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()

nlp.run(data_path)
Example #26
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #27
    def __init__(self) -> None:
        super().__init__()
        self.config = HParams(None, self.default_configs())
Example #28
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #    processor=CoNLLNERPredictor(), config=config.NER,
    #    selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #29
from termcolor import colored
from texar.torch import HParams

from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte.processors import CoNLLNERPredictor, SRLPredictor
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter

from ft.onto.base_ontology import Token, Sentence, PredicateLink, \
    PredicateMention, PredicateArgument, EntityMention

config = yaml.safe_load(open("config.yml", "r"))

config = HParams(config, default_hparams=None)


def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
Example #30
def create_hparams(hparams_string=None, verbose=False):
    """Create model hyperparameters. Parse nondefault from given string."""

    hparams = {
        ################################
        # Experiment Parameters        #
        ################################
        "epochs": 50000,
        "iters_per_checkpoint": 500,
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ['speaker_embedding.weight'],

        ################################
        # Data Parameters             #
        ################################
        "training_files": 'filelists/tts_dataset/info_tsukuyomi.txt',
        "validation_files": 'filelists/tts_dataset/info_tsukuyomi.txt',
        "text_cleaners": ['basic_cleaners'],
        "p_arpabet": 1.0,
        "cmudict_path": "data/cmu_dictionary",

        ################################
        # Audio Parameters             #
        ################################
        # changed because librosa uses the (-1, 1) range
        "max_wav_value": 1.0,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80,
        "f0_max": 880,
        "harm_thresh": 0.25,

        ################################
        # Model Parameters             #
        ################################
        "n_symbols": len(symbols),
        "symbols_embedding_dim": 512,

        # Encoder parameters
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": 512,

        # Decoder parameters
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": 1024,
        "prenet_dim": 256,
        "prenet_f0_n_layers": 1,
        "prenet_f0_dim": 1,
        "prenet_f0_kernel_size": 1,
        "prenet_rms_dim": 0,
        "prenet_rms_kernel_size": 1,
        "max_decoder_steps": 1000,
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        "p_teacher_forcing": 1.0,

        # Attention parameters
        "attention_rnn_dim": 1024,
        "attention_dim": 128,

        # Location Layer parameters
        "attention_location_n_filters": 32,
        "attention_location_kernel_size": 31,

        # Mel-post processing network parameters
        "postnet_embedding_dim": 512,
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,

        # Speaker embedding
        "n_speakers": 123,
        "speaker_embedding_dim": 128,

        # Reference encoder
        "with_gst": True,
        "ref_enc_filters": [32, 32, 64, 64, 128, 128],
        "ref_enc_size": [3, 3],
        "ref_enc_strides": [2, 2],
        "ref_enc_pad": [1, 1],
        "ref_enc_gru_size": 128,

        # Style Token Layer
        "token_embedding_size": 256,
        "token_num": 10,
        "num_heads": 8,

        ################################
        # Optimization Hyperparameters #
        ################################
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "learning_rate_min": 1e-5,
        "learning_rate_anneal": 50000,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 32,
        "mask_padding": True,  # set model's padded outputs to padded values
    }
    """
    if hparams_string:
        tf.compat.v1.logging.info(
            'Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.compat.v1.logging.info('Final parsed hparams: %s', hparams.values())
    """
    hparams = HParams(hparams, hparams)
    return hparams
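
A minimal usage sketch, assuming the HParams class used here behaves like texar's and exposes values as attributes (the command-line parsing path is disabled in the commented-out block above):

# Sketch only: read back a few of the values defined above.
hparams = create_hparams()
print(hparams.sampling_rate)   # 22050
print(hparams.batch_size)      # 32
print(hparams.n_mel_channels)  # 80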