Example No. 1
    def _process(self, input_pack: MultiPack):
        r"""Search using Twitter API to fetch tweets for a query.
        This query should be contained in the input multipack with name
        `self.config.query_pack_name`.
        Each result is added as a new data pack, and a
        `ft.onto.base_ontology.Document` annotation is used to cover the whole
        document.

        Args:
            input_pack: A multipack containing the query as a pack.
        """
        query_pack = input_pack.get_pack(self.configs.query_pack_name)

        query = query_pack.text
        tweets = self._query_tweets(query)

        for idx, tweet in enumerate(tweets):
            try:
                text = tweet.retweeted_status.full_text

            except AttributeError:  # Not a Retweet
                text = tweet.full_text

            pack: DataPack = input_pack.add_pack(
                f"{self.configs.response_pack_name_prefix}_{idx}")
            pack.pack_name = f"{self.configs.response_pack_name_prefix}_{idx}"

            pack.set_text(text)

            Document(pack=pack, begin=0, end=len(text))
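
For reference, here is a minimal sketch of how a downstream processor might read these response packs back. The `_consume_responses` name and the `num_hits` argument are hypothetical; the pack-name pattern and `get_single` follow the code above.

    def _consume_responses(self, input_pack: MultiPack, num_hits: int):
        # Hypothetical helper: walk the packs created by `_process` above.
        for idx in range(num_hits):
            pack = input_pack.get_pack(
                f"{self.configs.response_pack_name_prefix}_{idx}")
            doc: Document = pack.get_single(Document)
            print(pack.pack_name, doc.text)
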
Example No. 2
    def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
        r"""Takes a raw string and converts into a MultiPack.

        Args:
            data_source: str that contains text of a document.

        Yields: A MultiPack containing a DataPack for the current query.
        """
        multi_pack = MultiPack()

        # use context to build the query
        if self.resources is not None and self.resources.get("user_utterance"):
            multi_pack.add_pack_(
                self.resources.get("user_utterance")[-1], "user_utterance")

        if self.resources is not None and self.resources.get("bot_utterance"):
            multi_pack.add_pack_(
                self.resources.get("bot_utterance")[-1], "bot_utterance")

        pack = multi_pack.add_pack(self.configs.pack_name)
        pack.set_text(data_source, replace_func=self.text_replace_operation)

        Utterance(pack, 0, len(data_source))

        yield multi_pack
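
As a quick orientation, a hedged fragment showing how a later processor could read the query text back out of the MultiPack yielded above; `multi_pack` refers to that yielded pack, and "query" stands in for whatever `self.configs.pack_name` is configured to.

    # Sketch only, under the assumptions stated above.
    query_pack = multi_pack.get_pack("query")
    utterance = query_pack.get_single(Utterance)
    print(utterance.text)
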
Example No. 3
    def _process(self, input_pack: MultiPack):
        r"""Searches ElasticSearch indexer to fetch documents for a query. This
        query should be contained in the input multipack with name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
            input_pack: A multipack containing the query as a pack.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query: Query = query_pack.get_single(Query)
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]

        for idx, hit in enumerate(hits):
            document = hit["_source"]
            first_query.add_result(document["doc_id"], hit["_score"])

            pack: DataPack = input_pack.add_pack(
                f"{self.config.response_pack_name_prefix}_{idx}")
            pack.doc_id = document["doc_id"]

            content = document[self.config.field]
            pack.set_text(content)

            Document(pack=pack, begin=0, end=len(content))
Example No. 4
    def _process(self, input_pack: MultiPack):
        query = input_pack.get_pack(self.in_pack_name).text
        params = "?" + urlencode(
            {
                "api-version": "3.0",
                "from": self.src_language,
                "to": [self.target_language],
            },
            doseq=True,
        )
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(
            microsoft_constructed_url,
            headers=self.microsoft_headers,
            json=[{
                "text": query
            }],
        )

        if response.status_code != 200:
            raise RuntimeError(response.json()["error"]["message"])

        text = response.json()[0]["translations"][0]["text"]
        pack: DataPack = input_pack.add_pack(self.out_pack_name)
        pack.set_text(text=text)

        Document(pack, 0, len(text))
        Utterance(pack, 0, len(text))
Example No. 5
    def test_wrong_attribute(self):
        input_pack = MultiPack()
        mp_entry = ExampleMPEntry(input_pack)
        p1 = input_pack.add_pack('pack1')
        e1: DifferentEntry = p1.add_entry(DifferentEntry(p1))

        with self.assertRaises(TypeError):
            mp_entry.refer_entry = e1

        mp_entry.regret_creation()
Example No. 6
    def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
        fields = data_source.split("\t")
        multi_pack = MultiPack()

        data_pack = multi_pack.add_pack(self.config.pack_name)

        data_pack.doc_id = fields[0]
        data_pack.set_text(fields[1])

        Document(pack=data_pack, begin=0, end=len(fields[1]))

        yield multi_pack
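
A short sketch of the tab-separated input this reader expects and of the MultiPack it yields; `reader` is assumed to be an already-configured instance of the class above, and "doc" stands in for `self.config.pack_name`.

    # Sketch under the assumptions stated in the paragraph above.
    line = "doc_42\tThis is the body of document 42."
    for m_pack in reader._parse_pack(line):
        pack = m_pack.get_pack("doc")
        assert pack.doc_id == "doc_42"
        assert pack.text == "This is the body of document 42."
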
Example No. 7
    def test_wrong_attribute(self):
        import warnings

        input_pack = MultiPack()
        mp_entry = ExampleMPEntry(input_pack)
        p1 = input_pack.add_pack("pack1")
        e1: DifferentEntry = p1.add_entry(DifferentEntry(p1))
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            mp_entry.refer_entry = e1
            mp_entry.regret_creation()
            assert issubclass(w[-1].category, UserWarning)
Example No. 8
class SelectorTest(unittest.TestCase):
    def setUp(self) -> None:
        pm = PackManager()
        self.multi_pack = MultiPack(pm)

        data_pack1 = self.multi_pack.add_pack(ref_name="pack1")
        data_pack2 = self.multi_pack.add_pack(ref_name="pack2")
        data_pack3 = self.multi_pack.add_pack(ref_name="pack_three")

        data_pack1.pack_name = "1"
        data_pack2.pack_name = "2"
        data_pack3.pack_name = "Three"

    def test_name_match_selector(self) -> None:
        selector = NameMatchSelector(select_name="pack1")
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_regex_name_match_selector(self) -> None:
        selector = RegexNameMatchSelector(select_name="^.*\\d$")
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_first_pack_selector(self) -> None:
        selector = FirstPackSelector()
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), 1)
        self.assertEqual(packs[0].pack_name, "1")

    def test_all_pack_selector(self) -> None:
        selector = AllPackSelector()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)
Example No. 9
    def _process(self, multi_pack: MultiPack):
        # Add a pack.
        p1 = multi_pack.add_pack('pack1')
        p2 = multi_pack.add_pack('pack2')

        # Add some entries into one pack.
        e1: ExampleEntry = p1.add_entry(ExampleEntry(p1))
        e1.secret_number = 1
        p2.add_entry(ExampleEntry(p2))

        # Add the multi pack entry.
        mp_entry = ExampleMPEntry(multi_pack)
        mp_entry.refer_entry = e1
Example No. 10
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + '_copy'
        else:
            copy_pack.pack_name = 'copy'

        ent: EntityMention
        for ent in from_pack.get(EntityMention):
            EntityMention(copy_pack, ent.begin, ent.end)
Example No. 11
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + '_copy'
        else:
            copy_pack.pack_name = 'copy'

        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)
Example No. 12
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:  # type: ignore
        with open(file_path, "r", encoding="utf8") as doc:
            for line in doc:
                line = line.strip()
                if len(line) == 0:
                    continue

                m_pack = MultiPack()
                pack = m_pack.add_pack("pack")
                pack.set_text(line)

                Sentence(pack, 0, len(line))
                self.count += 1

                yield m_pack  # type: ignore
Example No. 13
    def _process(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack(self.configs.query_pack_name)
        first_query = list(query_pack.get(Query))[0]
        results = self.index.search(first_query.value, self.k)
        documents = [r[1] for result in results for r in result]

        packs = {}
        for i, doc in enumerate(documents):
            pack = input_pack.add_pack()
            pack.set_text(doc)

            Document(pack, 0, len(doc))
            packs[self.configs.response_pack_name_prefix + f"_{i}"] = pack

        input_pack.update_pack(packs)
Example No. 14
    def test_multi_pack_copy_link_or_group(self):
        processor = ReplacementDataAugmentProcessor()
        m_pack = MultiPack()
        src_pack = m_pack.add_pack("src")
        tgt_pack = m_pack.add_pack("tgt")

        src_pack.set_text("input")
        tgt_pack.set_text("output")
        src_token = src_pack.add_entry(Token(src_pack, 0, len(src_pack.text)))
        tgt_token = tgt_pack.add_entry(Token(tgt_pack, 0, len(tgt_pack.text)))

        mpl = m_pack.add_entry(MultiPackLink(m_pack, src_token, tgt_token))
        # The MultiPackLink should not be copied, because its children are not copied.
        self.assertEqual(processor._copy_multi_pack_link_or_group(mpl, m_pack), False)
        new_src_pack = processor._auto_align_annotations(src_pack, [])
        self.assertEqual(len(list(new_src_pack.get(Token))), 1)
Example No. 15
    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)

        if from_pack.pack_name is not None:
            copy_pack.pack_name = from_pack.pack_name + "_copy"
        else:
            copy_pack.pack_name = "copy"

        s: Sentence
        for s in from_pack.get(Sentence):
            Sentence(copy_pack, s.begin, s.end)

        e: EntityMention
        for e in from_pack.get(EntityMention):
            EntityMention(copy_pack, e.begin, e.end)
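
The per-type copy loops in Examples 10, 11, and 15 can be folded into a single pass over a tuple of annotation types. The following is only a sketch of that refactoring, with the tuple chosen to match Example 15.

    def _process(self, input_pack: MultiPack):
        from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
        copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

        copy_pack.set_text(from_pack.text)
        copy_pack.pack_name = (
            from_pack.pack_name + "_copy"
            if from_pack.pack_name is not None
            else "copy"
        )

        # One loop over the annotation types instead of one loop per type.
        for entry_type in (Sentence, EntityMention):
            for anno in from_pack.get(entry_type):
                entry_type(copy_pack, anno.begin, anno.end)
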
Example No. 16
    def _process(self, input_pack: MultiPack):
        r"""Searches `Elasticsearch` indexer to fetch documents for a query.
        This query should be contained in the input multipack with name
        `self.configs.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
            input_pack: A multipack containing the query as a pack.
        """
        query_pack = input_pack.get_pack(self.configs.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query: Query = query_pack.get_single(Query)
        # pylint: disable=isinstance-second-argument-not-valid-type
        # TODO: until fix: https://github.com/PyCQA/pylint/issues/3507
        if not isinstance(first_query.value, Dict):
            raise ValueError(
                "The query to the elastic indexer need to be a dictionary.")
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]

        for idx, hit in enumerate(hits):
            document = hit["_source"]
            first_query.add_result(document["doc_id"], hit["_score"])

            if self.configs.indexed_text_only:
                pack: DataPack = input_pack.add_pack(
                    f"{self.configs.response_pack_name_prefix}_{idx}")
                pack.pack_name = document["doc_id"]

                content = document[self.configs.field]
                pack.set_text(content)

                Document(pack=pack, begin=0, end=len(content))

            else:
                pack = DataPack.deserialize(document["pack_info"])
                input_pack.add_pack_(
                    pack, f"{self.configs.response_pack_name_prefix}_{idx}")
                pack.pack_name = document["doc_id"]
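
For readability, a sketch of the hit structure the loop above assumes; the field names come from the accesses in the code, the values are purely illustrative, and "content" stands in for `self.configs.field`.

    hit = {
        "_score": 12.3,
        "_source": {
            "doc_id": "doc_0",
            "content": "the indexed document text",
            "pack_info": "<serialized DataPack, used when indexed_text_only is False>",
        },
    }
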
Example No. 17
    def _process(self, input_pack: MultiPack):
        query = input_pack.get_pack(self.in_pack_name).text
        params = '?' + urlencode(
            {'api-version': '3.0',
             'from': self.src_language,
             'to': [self.target_language]}, doseq=True)
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(
            microsoft_constructed_url, headers=self.microsoft_headers,
            json=[{"text": query}])

        if response.status_code != 200:
            raise RuntimeError(response.json()['error']['message'])

        text = response.json()[0]["translations"][0]["text"]
        pack: DataPack = input_pack.add_pack(self.out_pack_name)
        pack.set_text(text=text)

        Document(pack, 0, len(text))
        Utterance(pack, 0, len(text))
Example No. 18
class SelectorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.multi_pack = MultiPack()

        data_pack1 = self.multi_pack.add_pack(ref_name="pack1")
        data_pack2 = self.multi_pack.add_pack(ref_name="pack2")
        data_pack3 = self.multi_pack.add_pack(ref_name="pack_three")

        data_pack1.pack_name = "1"
        data_pack2.pack_name = "2"
        data_pack3.pack_name = "Three"

    def test_name_match_selector(self) -> None:
        selector = NameMatchSelector()
        selector.initialize(
            configs={"select_name": "pack1"},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize(
            configs={"select_name": "pack1", "reverse_selection": True},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_name_match_selector_backward_compatability(self) -> None:
        selector = NameMatchSelector(select_name="pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        selector = NameMatchSelector("pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_regex_name_match_selector(self) -> None:
        selector = RegexNameMatchSelector()
        selector.initialize(
            configs={"select_name": "^.*\\d$"},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize(
            {"select_name": "^.*\\d$", "reverse_selection": True}
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_regex_name_match_selector_backward_compatability(self) -> None:
        selector = RegexNameMatchSelector(select_name="^.*\\d$")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test different configuration method (backward compatibility)
        selector = RegexNameMatchSelector("^.*\\d$")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize({"reverse_selection": True})
        packs = selector.select(self.multi_pack)
        doc_ids = ["Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_first_pack_selector(self) -> None:
        selector = FirstPackSelector()
        selector.initialize()
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), 1)
        self.assertEqual(packs[0].pack_name, "1")

        # Test reverse selection.
        selector.initialize({"reverse_selection": True})
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), len(self.multi_pack.packs) - 1)

    def test_all_pack_selector(self) -> None:
        selector = AllPackSelector()
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)
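
As a usage note, a minimal sketch of how a configured selector is typically applied: it narrows a MultiPack down to the packs a processor should touch. Here `multi_pack` is assumed to be an existing MultiPack, and both the regex and the `Document` annotation added per pack are illustrative.

    selector = RegexNameMatchSelector()
    selector.initialize(configs={"select_name": "^pack\\d$"})
    for pack in selector.select(multi_pack):
        # Operate only on the packs whose names match the pattern.
        Document(pack, 0, len(pack.text))
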
Example No. 19
class DataPackTest(unittest.TestCase):

    def setUp(self) -> None:
        # Note: input source is created automatically by the system, but we
        #  can also set it manually at test cases.
        pm = PackManager()
        self.multi_pack = MultiPack(pm)
        self.data_pack1 = self.multi_pack.add_pack(ref_name="left pack")
        self.data_pack2 = self.multi_pack.add_pack(ref_name="right pack")

        self.data_pack1.pack_name = "some pack"
        self.data_pack1.set_text("This pack contains some sample data.")

        self.data_pack2.pack_name = "another pack"
        self.data_pack2.set_text("This pack contains some other sample data.")

    def test_serialization(self):
        ser_str: str = self.multi_pack.serialize()
        print(ser_str)

    def test_add_pack(self):
        data_pack3 = self.multi_pack.add_pack(ref_name="new pack")
        data_pack3.pack_name = "the third pack"
        data_pack3.set_text("Test to see if we can add new packs.")

        self.assertEqual(len(self.multi_pack.packs), 3)
        self.assertEqual(self.multi_pack.pack_names,
                         {'left pack', 'right pack', 'new pack'})

    def test_rename_pack(self):
        self.multi_pack.rename_pack('right pack', 'last pack')
        self.assertEqual(self.multi_pack.pack_names,
                         {'left pack', 'last pack'})

    def test_multipack_groups(self):
        """
        Test some multi pack group.
        Returns:

        """
        # Add tokens to each pack.
        for pack in self.multi_pack.packs:
            _space_token(pack)

        # Create some group.
        token: Annotation
        left_tokens = {}
        for token in self.multi_pack.packs[0].get(Token):
            left_tokens[token.text] = token

        right_tokens = {}
        for token in self.multi_pack.packs[1].get(Token):
            right_tokens[token.text] = token

        for key, lt in left_tokens.items():
            if key in right_tokens:
                rt = right_tokens[key]
                self.multi_pack.add_entry(MultiPackGroup(
                    self.multi_pack, [lt, rt]))

        # Check the groups.
        expected_content = [("This", "This"), ("pack", "pack"),
                            ("contains", "contains"), ("some", "some"),
                            ("sample", "sample"), ("data.", "data.")]

        group_content = []
        g: MultiPackGroup
        for g in self.multi_pack.get(MultiPackGroup):
            e: Annotation
            group_content.append(tuple([e.text for e in g.get_members()]))

        self.assertListEqual(expected_content, group_content)

    def test_multipack_entries(self):
        """
        Test some multi pack entry.
        Returns:

        """
        # 1. Add tokens to each pack.
        for pack in self.multi_pack.packs:
            _space_token(pack)

        left_tokens = [t.text for t in self.multi_pack.packs[0].get(Token)]
        right_tokens = [t.text for t in self.multi_pack.packs[1].get(Token)]

        self.assertListEqual(left_tokens,
                             ["This", "pack", "contains", "some", "sample",
                              "data."])
        self.assertListEqual(right_tokens,
                             ["This", "pack", "contains", "some", "other",
                              "sample", "data."])

        # 2. Link the same words from two packs.
        token: Annotation
        left_tokens = {}
        for token in self.multi_pack.packs[0].get(Token):
            left_tokens[token.text] = token

        right_tokens = {}
        for token in self.multi_pack.packs[1].get(Token):
            right_tokens[token.text] = token

        for key, lt in left_tokens.items():
            if key in right_tokens:
                rt = right_tokens[key]
                self.multi_pack.add_entry(MultiPackLink(
                    self.multi_pack, lt, rt))

        # One way to link tokens.
        linked_tokens = []
        for link in self.multi_pack.links:
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("pack", "pack"), ("contains", "contains"),
             ("some", "some"), ("sample", "sample"), ("data.", "data.")])

        # Another way to get the links
        linked_tokens = []
        for link in self.multi_pack.get(MultiPackLink):
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("pack", "pack"), ("contains", "contains"),
             ("some", "some"), ("sample", "sample"), ("data.", "data.")])

        # 3. Test deletion

        # Delete the second link.
        self.multi_pack.delete_entry(self.multi_pack.links[1])

        linked_tokens = []
        for link in self.multi_pack.links:
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("contains", "contains"),
             ("some", "some"), ("sample", "sample"), ("data.", "data.")])
Example No. 20
    def _process(self, input_pack: MultiPack):
        # Copy the text of the first pack into a new, unnamed pack.
        pack = input_pack.add_pack()
        pack.set_text(input_pack.get_pack_at(0).text)
Example No. 21
    def _process(self, input_pack: MultiPack):
        # `docs` is assumed to be a mapping from pack name to document text,
        # built elsewhere in the processor (not shown in this snippet).
        for doc_i in docs:
            pack = input_pack.add_pack(ref_name=doc_i)
            pack.set_text(docs[doc_i])
            Document(pack, 0, len(pack.text))
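
Finally, a self-contained sketch of the pattern in Example 21: one DataPack per document, each covered by a Document annotation. The import paths assume the usual Forte locations, the `docs` dictionary is illustrative data, and depending on the Forte version `MultiPack()` may need a `PackManager`, as in Examples 8 and 19.

    from forte.data.multi_pack import MultiPack
    from ft.onto.base_ontology import Document

    docs = {"doc_a": "First document.", "doc_b": "Second document."}

    input_pack = MultiPack()
    for doc_i in docs:
        pack = input_pack.add_pack(ref_name=doc_i)
        pack.set_text(docs[doc_i])
        Document(pack, 0, len(pack.text))

    assert len(input_pack.packs) == 2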