def _process(self, input_pack: MultiPack):
    r"""Search using Twitter API to fetch tweets for a query.

    The query text is read from the data pack named
    `self.configs.query_pack_name` inside the input multipack. Each
    result is added as a new data pack, and a
    `ft.onto.base_ontology.Document` annotation is used to cover the
    whole document.

    Args:
        input_pack: A multipack containing query as a pack.
    """
    search_text = input_pack.get_pack(self.configs.query_pack_name).text

    for idx, tweet in enumerate(self._query_tweets(search_text)):
        try:
            # Retweets carry the full text on the original status.
            full_text = tweet.retweeted_status.full_text
        except AttributeError:  # Not a Retweet
            full_text = tweet.full_text

        result_name = f"{self.configs.response_pack_name_prefix}_{idx}"
        result_pack: DataPack = input_pack.add_pack(result_name)
        result_pack.pack_name = result_name
        result_pack.set_text(full_text)
        Document(pack=result_pack, begin=0, end=len(full_text))
def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
    r"""Takes a raw string and converts into a MultiPack.

    Args:
        data_source: str that contains text of a document.

    Returns:
        MultiPack containing a datapack for the current query.
    """
    multi_pack = MultiPack()

    # Attach conversation context packs (if available) so the query can
    # be built from prior user/bot turns.
    for context_name in ("user_utterance", "bot_utterance"):
        if self.resources is not None and self.resources.get(context_name):
            multi_pack.add_pack_(
                self.resources.get(context_name)[-1], context_name
            )

    query_pack = multi_pack.add_pack(self.configs.pack_name)
    query_pack.set_text(data_source, replace_func=self.text_replace_operation)
    Utterance(query_pack, 0, len(data_source))
    yield multi_pack
def _process(self, input_pack: MultiPack):
    r"""Searches ElasticSearch indexer to fetch documents for a query.

    The query is read from the pack named `self.config.query_pack_name`
    in the input multipack. This method adds new packs to `input_pack`
    containing the retrieved results; each result is annotated with a
    `ft.onto.base_ontology.Document` covering its full text.

    Args:
        input_pack: A multipack containing query as a pack.
    """
    query_pack = input_pack.get_pack(self.config.query_pack_name)

    # ElasticSearchQueryCreator adds a Query entry to the query pack;
    # fetch it as the single element.
    first_query: Query = query_pack.get_single(Query)
    results = self.index.search(first_query.value)

    for idx, hit in enumerate(results["hits"]["hits"]):
        source_doc = hit["_source"]
        first_query.add_result(source_doc["doc_id"], hit["_score"])

        result_pack: DataPack = input_pack.add_pack(
            f"{self.config.response_pack_name_prefix}_{idx}"
        )
        result_pack.doc_id = source_doc["doc_id"]
        text = source_doc[self.config.field]
        result_pack.set_text(text)
        Document(pack=result_pack, begin=0, end=len(text))
def _process(self, input_pack: MultiPack):
    """Translate the input pack's text with the Microsoft Translator API.

    Reads the source text from the pack named ``self.in_pack_name``,
    posts it to the translation endpoint, and writes the translated text
    into a new pack named ``self.out_pack_name`` annotated with both a
    Document and an Utterance spanning the whole text.

    Raises:
        RuntimeError: if the API responds with a non-200 status code.
    """
    source_text = input_pack.get_pack(self.in_pack_name).text
    query_string = urlencode(
        {
            "api-version": "3.0",
            "from": self.src_language,
            "to": [self.target_language],
        },
        doseq=True,
    )
    request_url = self.microsoft_translate_url + "?" + query_string

    response = requests.post(
        request_url,
        headers=self.microsoft_headers,
        json=[{"text": source_text}],
    )
    if response.status_code != 200:
        raise RuntimeError(response.json()["error"]["message"])

    translated = response.json()[0]["translations"][0]["text"]
    out_pack: DataPack = input_pack.add_pack(self.out_pack_name)
    out_pack.set_text(text=translated)
    Document(out_pack, 0, len(translated))
    Utterance(out_pack, 0, len(translated))
def test_wrong_attribute(self):
    # Assigning an entry of an incompatible type to the multi-pack
    # entry's reference attribute must raise a TypeError.
    mp = MultiPack()
    mp_entry = ExampleMPEntry(mp)
    pack_one = mp.add_pack('pack1')
    wrong_entry: DifferentEntry = pack_one.add_entry(DifferentEntry(pack_one))

    with self.assertRaises(TypeError):
        mp_entry.refer_entry = wrong_entry
        mp_entry.regret_creation()
def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
    """Parse one tab-separated record into a MultiPack.

    The first column is used as the doc id and the second as the
    document text, which is covered by a Document annotation.
    """
    columns = data_source.split("\t")

    multi_pack = MultiPack()
    data_pack = multi_pack.add_pack(self.config.pack_name)
    data_pack.doc_id = columns[0]
    data_pack.set_text(columns[1])
    Document(pack=data_pack, begin=0, end=len(columns[1]))
    yield multi_pack
def test_wrong_attribute(self):
    """Assigning an entry of the wrong type should emit a UserWarning."""
    import warnings

    input_pack = MultiPack()
    mp_entry = ExampleMPEntry(input_pack)
    p1 = input_pack.add_pack("pack1")
    e1: DifferentEntry = p1.add_entry(DifferentEntry(p1))

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        mp_entry.refer_entry = e1
        mp_entry.regret_creation()

    # Guard against an empty capture list first: otherwise w[-1] raises
    # IndexError and masks the real failure (no warning emitted at all).
    self.assertTrue(len(w) > 0, "Expected a warning to be emitted.")
    self.assertTrue(issubclass(w[-1].category, UserWarning))
class SelectorTest(unittest.TestCase):
    """Tests for the pack selectors against a three-pack MultiPack."""

    def setUp(self) -> None:
        pm = PackManager()
        self.multi_pack = MultiPack(pm)
        data_pack1 = self.multi_pack.add_pack(ref_name="pack1")
        data_pack2 = self.multi_pack.add_pack(ref_name="pack2")
        data_pack3 = self.multi_pack.add_pack(ref_name="pack_three")
        data_pack1.pack_name = "1"
        data_pack2.pack_name = "2"
        data_pack3.pack_name = "Three"

    def _check_selection(self, selector, expected_doc_ids) -> None:
        # Materialize the selection so its length can be checked: the
        # previous zip-based loops passed vacuously when the selector
        # returned no packs at all.
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), len(expected_doc_ids))
        for doc_id, pack in zip(expected_doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_name_match_selector(self) -> None:
        self._check_selection(NameMatchSelector(select_name="pack1"), ["1"])

    def test_regex_name_match_selector(self) -> None:
        # "pack1" and "pack2" end in a digit; "pack_three" does not.
        self._check_selection(
            RegexNameMatchSelector(select_name="^.*\\d$"), ["1", "2"]
        )

    def test_first_pack_selector(self) -> None:
        self._check_selection(FirstPackSelector(), ["1"])

    def test_all_pack_selector(self) -> None:
        self._check_selection(AllPackSelector(), ["1", "2", "Three"])
def _process(self, multi_pack: MultiPack):
    """Populate the multipack with two packs, pack-level entries, and a
    multi-pack entry referring into the first pack."""
    first_pack = multi_pack.add_pack('pack1')
    second_pack = multi_pack.add_pack('pack2')

    # One entry (carrying a secret number) in the first pack, a plain
    # entry in the second.
    pack_entry: ExampleEntry = first_pack.add_entry(ExampleEntry(first_pack))
    pack_entry.secret_number = 1
    second_pack.add_entry(ExampleEntry(second_pack))

    # The multi-pack level entry points back at the first pack's entry.
    mp_entry = ExampleMPEntry(multi_pack)
    mp_entry.refer_entry = pack_entry
def _process(self, input_pack: MultiPack):
    """Copy the source pack's text and entity mentions into a new pack."""
    source: DataPack = input_pack.get_pack(self.configs.copy_from)
    target: DataPack = input_pack.add_pack(self.configs.copy_to)
    target.set_text(source.text)

    # Derive the copy's name from the source pack's name when present.
    if source.pack_name is None:
        target.pack_name = 'copy'
    else:
        target.pack_name = source.pack_name + '_copy'

    mention: EntityMention
    for mention in source.get(EntityMention):
        EntityMention(target, mention.begin, mention.end)
def _process(self, input_pack: MultiPack):
    """Copy the source pack's text and sentences into a new pack."""
    source: DataPack = input_pack.get_pack(self.configs.copy_from)
    target: DataPack = input_pack.add_pack(self.configs.copy_to)
    target.set_text(source.text)

    # Derive the copy's name from the source pack's name when present.
    if source.pack_name is None:
        target.pack_name = 'copy'
    else:
        target.pack_name = source.pack_name + '_copy'

    sent: Sentence
    for sent in source.get(Sentence):
        Sentence(target, sent.begin, sent.end)
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:  # type: ignore
    """Yield one MultiPack per non-empty line of the input file, each
    holding a single Sentence-annotated data pack."""
    with open(file_path, "r", encoding="utf8") as doc:
        for raw_line in doc:
            stripped = raw_line.strip()
            if not stripped:
                continue
            m_pack = MultiPack()
            line_pack = m_pack.add_pack("pack")
            line_pack.set_text(stripped)
            Sentence(line_pack, 0, len(stripped))
            self.count += 1
            yield m_pack  # type: ignore
def _process(self, input_pack: MultiPack):
    """Search the index with the pack's query and attach each retrieved
    document as a new Document-annotated pack."""
    query_pack = input_pack.get_pack(self.configs.query_pack_name)
    first_query = list(query_pack.get(Query))[0]
    results = self.index.search(first_query.value, self.k)

    # Flatten the nested result lists, keeping only the document text
    # (second element of each hit).
    documents = [hit[1] for batch in results for hit in batch]

    named_packs = {}
    for i, doc_text in enumerate(documents):
        doc_pack = input_pack.add_pack()
        doc_pack.set_text(doc_text)
        Document(doc_pack, 0, len(doc_text))
        named_packs[self.configs.response_pack_name_prefix + f"_{i}"] = doc_pack

    input_pack.update_pack(named_packs)
def test_multi_pack_copy_link_or_group(self):
    """A MultiPackLink whose children were not copied must not be copied."""
    processor = ReplacementDataAugmentProcessor()
    m_pack = MultiPack()
    src_pack = m_pack.add_pack("src")
    tgt_pack = m_pack.add_pack("tgt")
    src_pack.set_text("input")
    tgt_pack.set_text("output")

    # One token spanning each pack's full text, linked across packs.
    src_token = src_pack.add_entry(Token(src_pack, 0, len(src_pack.text)))
    tgt_token = tgt_pack.add_entry(Token(tgt_pack, 0, len(tgt_pack.text)))
    link = m_pack.add_entry(MultiPackLink(m_pack, src_token, tgt_token))

    # The MultiPackLink should not be copied, because its children are
    # not copied.
    self.assertEqual(
        processor._copy_multi_pack_link_or_group(link, m_pack), False
    )

    new_src_pack = processor._auto_align_annotations(src_pack, [])
    self.assertEqual(len(list(new_src_pack.get(Token))), 1)
def _process(self, input_pack: MultiPack):
    """Copy the source pack's text, sentences and entity mentions into a
    newly created pack."""
    source: DataPack = input_pack.get_pack(self.configs.copy_from)
    duplicate: DataPack = input_pack.add_pack(self.configs.copy_to)
    duplicate.set_text(source.text)

    duplicate.pack_name = (
        "copy" if source.pack_name is None else source.pack_name + "_copy"
    )

    # Re-create each span annotation type on the duplicated pack.
    for annotation_type in (Sentence, EntityMention):
        for annotation in source.get(annotation_type):
            annotation_type(duplicate, annotation.begin, annotation.end)
def _process(self, input_pack: MultiPack):
    r"""Searches `Elasticsearch` indexer to fetch documents for a query.

    The query is read from the pack named `self.configs.query_pack_name`
    in the input multipack. This method adds new packs to `input_pack`
    containing the retrieved results; each result is annotated as a
    `ft.onto.base_ontology.Document`.

    Args:
        input_pack: A multipack containing query as a pack.

    Raises:
        ValueError: if the stored query value is not a dictionary.
    """
    query_pack = input_pack.get_pack(self.configs.query_pack_name)

    # ElasticSearchQueryCreator adds a Query entry to the query pack;
    # retrieve it as the single element.
    first_query: Query = query_pack.get_single(Query)

    # pylint: disable=isinstance-second-argument-not-valid-type
    # TODO: until fix: https://github.com/PyCQA/pylint/issues/3507
    if not isinstance(first_query.value, Dict):
        raise ValueError(
            "The query to the elastic indexer need to be a dictionary.")

    results = self.index.search(first_query.value)
    for idx, hit in enumerate(results["hits"]["hits"]):
        source_doc = hit["_source"]
        first_query.add_result(source_doc["doc_id"], hit["_score"])
        pack_ref = f"{self.configs.response_pack_name_prefix}_{idx}"

        if self.configs.indexed_text_only:
            # Build a fresh pack holding just the indexed text field.
            result_pack: DataPack = input_pack.add_pack(pack_ref)
            result_pack.pack_name = source_doc["doc_id"]
            text = source_doc[self.configs.field]
            result_pack.set_text(text)
            Document(pack=result_pack, begin=0, end=len(text))
        else:
            # Restore the fully annotated pack serialized in the index.
            result_pack = DataPack.deserialize(source_doc["pack_info"])
            input_pack.add_pack_(result_pack, pack_ref)
            result_pack.pack_name = source_doc["doc_id"]
def _process(self, input_pack: MultiPack):
    """Translate the input pack's text via the Microsoft translation
    endpoint and write the result into the output pack.

    Raises:
        RuntimeError: if the API responds with a non-200 status code.
    """
    source_text = input_pack.get_pack(self.in_pack_name).text
    query_string = urlencode(
        {'api-version': '3.0',
         'from': self.src_language,
         'to': [self.target_language]},
        doseq=True)
    request_url = self.microsoft_translate_url + '?' + query_string

    response = requests.post(
        request_url,
        headers=self.microsoft_headers,
        json=[{"text": source_text}])
    if response.status_code != 200:
        raise RuntimeError(response.json()['error']['message'])

    translated_text = response.json()[0]["translations"][0]["text"]
    result_pack: DataPack = input_pack.add_pack(self.out_pack_name)
    result_pack.set_text(text=translated_text)
    Document(result_pack, 0, len(translated_text))
    Utterance(result_pack, 0, len(translated_text))
class SelectorTest(unittest.TestCase):
    """Unit tests for the pack selectors (name match, regex match, first,
    and all), including reverse selection and the backward-compatible
    constructor-argument configuration style."""

    def setUp(self) -> None:
        # Fixture: three packs with ref names "pack1", "pack2",
        # "pack_three" and pack names "1", "2", "Three".
        self.multi_pack = MultiPack()
        data_pack1 = self.multi_pack.add_pack(ref_name="pack1")
        data_pack2 = self.multi_pack.add_pack(ref_name="pack2")
        data_pack3 = self.multi_pack.add_pack(ref_name="pack_three")
        data_pack1.pack_name = "1"
        data_pack2.pack_name = "2"
        data_pack3.pack_name = "Three"

    def test_name_match_selector(self) -> None:
        """Selecting by exact ref name yields only the matching pack;
        reversed, it yields every other pack."""
        selector = NameMatchSelector()
        selector.initialize(
            configs={"select_name": "pack1"},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize(
            configs={"select_name": "pack1", "reverse_selection": True},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_name_match_selector_backward_compatability(self) -> None:
        """The legacy constructor-argument style (keyword or positional)
        still configures the selector after initialize()."""
        selector = NameMatchSelector(select_name="pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        selector = NameMatchSelector("pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_regex_name_match_selector(self) -> None:
        """Regex selection on ref names: "pack1"/"pack2" end in a digit
        and match; "pack_three" does not."""
        selector = RegexNameMatchSelector()
        selector.initialize(
            configs={"select_name": "^.*\\d$"},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize(
            {"select_name": "^.*\\d$", "reverse_selection": True}
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_regex_name_match_selector_backward_compatability(self) -> None:
        """Regex selector also supports the legacy constructor-argument
        configuration, and reverse selection on top of it."""
        selector = RegexNameMatchSelector(select_name="^.*\\d$")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test different configuration method (backward compatibility)
        selector = RegexNameMatchSelector("^.*\\d$")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize({"reverse_selection": True})
        packs = selector.select(self.multi_pack)
        doc_ids = ["Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

    def test_first_pack_selector(self) -> None:
        """FirstPackSelector yields exactly the first pack; reversed, it
        yields every pack except the first."""
        selector = FirstPackSelector()
        selector.initialize()
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), 1)
        self.assertEqual(packs[0].pack_name, "1")

        # Test reverse selection.
        selector.initialize({"reverse_selection": True})
        packs = list(selector.select(self.multi_pack))
        self.assertEqual(len(packs), len(self.multi_pack.packs) - 1)

    def test_all_pack_selector(self) -> None:
        """AllPackSelector yields every pack in insertion order."""
        selector = AllPackSelector()
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1", "2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)
class DataPackTest(unittest.TestCase):
    """Tests for MultiPack behavior: pack management (add/rename),
    serialization, multi-pack groups, and multi-pack links."""

    def setUp(self) -> None:
        # Note: input source is created automatically by the system, but we
        # can also set it manually at test cases.
        pm = PackManager()
        self.multi_pack = MultiPack(pm)
        self.data_pack1 = self.multi_pack.add_pack(ref_name="left pack")
        self.data_pack2 = self.multi_pack.add_pack(ref_name="right pack")
        self.data_pack1.pack_name = "some pack"
        self.data_pack1.set_text("This pack contains some sample data.")
        self.data_pack2.pack_name = "another pack"
        self.data_pack2.set_text("This pack contains some other sample data.")

    def test_serialization(self):
        # Smoke test: serialization should complete without raising.
        ser_str: str = self.multi_pack.serialize()
        print(ser_str)

    def test_add_pack(self):
        data_pack3 = self.multi_pack.add_pack(ref_name="new pack")
        data_pack3.pack_name = "the third pack"
        data_pack3.set_text("Test to see if we can add new packs..")
        self.assertEqual(len(self.multi_pack.packs), 3)
        self.assertEqual(self.multi_pack.pack_names,
                         {'left pack', 'right pack', 'new pack'})

    def test_rename_pack(self):
        self.multi_pack.rename_pack('right pack', 'last pack')
        self.assertEqual(self.multi_pack.pack_names,
                         {'left pack', 'last pack'})

    def test_multipack_groups(self):
        """
        Test some multi pack group.
        Returns:
        """
        # Add tokens to each pack.
        for pack in self.multi_pack.packs:
            _space_token(pack)

        # Create some group.
        token: Annotation
        left_tokens = {}
        for token in self.multi_pack.packs[0].get(Token):
            left_tokens[token.text] = token
        right_tokens = {}
        for token in self.multi_pack.packs[1].get(Token):
            right_tokens[token.text] = token
        # Group every token text shared by both packs.
        for key, lt in left_tokens.items():
            if key in right_tokens:
                rt = right_tokens[key]
                self.multi_pack.add_entry(MultiPackGroup(
                    self.multi_pack, [lt, rt]))

        # Check the groups.
        expected_content = [("This", "This"), ("pack", "pack"),
                            ("contains", "contains"), ("some", "some"),
                            ("sample", "sample"), ("data.", "data.")]
        group_content = []
        g: MultiPackGroup
        for g in self.multi_pack.get(MultiPackGroup):
            e: Annotation
            group_content.append(tuple([e.text for e in g.get_members()]))
        self.assertListEqual(expected_content, group_content)

    def test_multipack_entries(self):
        """
        Test some multi pack entry.
        Returns:
        """
        # 1. Add tokens to each pack.
        for pack in self.multi_pack.packs:
            _space_token(pack)

        left_tokens = [t.text for t in self.multi_pack.packs[0].get(Token)]
        right_tokens = [t.text for t in self.multi_pack.packs[1].get(Token)]

        self.assertListEqual(left_tokens,
                             ["This", "pack", "contains", "some", "sample",
                              "data."])
        self.assertListEqual(right_tokens,
                             ["This", "pack", "contains", "some", "other",
                              "sample", "data."])

        # 2. Link the same words from two packs.
        token: Annotation
        left_tokens = {}
        for token in self.multi_pack.packs[0].get(Token):
            left_tokens[token.text] = token
        right_tokens = {}
        for token in self.multi_pack.packs[1].get(Token):
            right_tokens[token.text] = token

        for key, lt in left_tokens.items():
            if key in right_tokens:
                rt = right_tokens[key]
                self.multi_pack.add_entry(MultiPackLink(
                    self.multi_pack, lt, rt))

        # One way to link tokens.
        linked_tokens = []
        for link in self.multi_pack.links:
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("pack", "pack"), ("contains", "contains"),
             ("some", "some"), ("sample", "sample"), ("data.", "data.")])

        # Another way to get the links
        linked_tokens = []
        for link in self.multi_pack.get(MultiPackLink):
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("pack", "pack"), ("contains", "contains"),
             ("some", "some"), ("sample", "sample"), ("data.", "data.")])

        # 3. Test deletion

        # Delete the second link.
        self.multi_pack.delete_entry(self.multi_pack.links[1])

        linked_tokens = []
        for link in self.multi_pack.links:
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_tokens.append((parent_text, child_text))

        # After removing the second link ("pack", "pack"), the remaining
        # links keep their original order.
        self.assertListEqual(
            linked_tokens,
            [("This", "This"), ("contains", "contains"), ("some", "some"),
             ("sample", "sample"), ("data.", "data.")])
def _process(self, input_pack: MultiPack):
    """Append a new pack mirroring the text of the first pack."""
    mirror = input_pack.add_pack()
    mirror.set_text(input_pack.get_pack_at(0).text)
def _process(self, input_pack: MultiPack):
    """Create one Document-annotated pack per entry of ``docs``.

    NOTE(review): ``docs`` is a free name not defined in this block —
    presumably a module-level mapping of doc name to text; confirm in
    the enclosing file.
    """
    for doc_name in docs:
        doc_pack = input_pack.add_pack(ref_name=doc_name)
        doc_pack.set_text(docs[doc_name])
        Document(doc_pack, 0, len(doc_pack.text))