def test_duplicate_ids_not_allowed(self, mock_download_dataset):
    """Loading a split that contains the same question twice must fail.

    The identical ``TRAIN_QUESTION`` record is written twice, so both
    entries share one id; ``dm.HOTPOT_QA`` is expected to raise
    ``AssertionError`` for it.
    """
    duplicated_questions = [TRAIN_QUESTION, TRAIN_QUESTION]
    self.write_questions(HotpotQAType.TRAIN, duplicated_questions)

    with self.assertRaises(AssertionError):
        dm.HOTPOT_QA(HotpotQAType.TRAIN)

    # Checked after the failure: the download mock must still have been
    # invoked exactly once for the HotpotQA collection.
    mock_download_dataset.assert_called_once_with(
        Collection.HOTPOT_QA, ANY
    )
def test_load_in_test_fullwiki_format(self, mock_download_dataset):
    """Check the frame produced for a TEST_FULLWIKI-format question.

    The expected frame has ``null`` for the answer, question type and
    question level, and empty lists for gold paragraphs and supporting
    facts; only id, question and context carry values.
    """
    self.write_questions(
        HotpotQAType.TEST_FULLWIKI, [TEST_FULLWIKI_QUESTION]
    )

    loaded_df = dm.HOTPOT_QA(HotpotQAType.TEST_FULLWIKI)

    expected_records = json.loads("""[
        {
            "id": "5ab5072e5542990594ba9cda",
            "question": "Test question?",
            "answer": null,
            "gold_paragraphs": [],
            "supporting_facts": [],
            "context": [
                [
                    "The Rolling Stone Album Guide",
                    ["Sent 1.", " Sent 2."]
                ],
                [
                    "Fear and Loathing at Rolling Stone",
                    ["Sent 3."]
                ]
            ],
            "question_type": null,
            "question_level": null
        }
    ]""")
    expected_df = pd.DataFrame(expected_records)

    pd.testing.assert_frame_equal(loaded_df, expected_df)
    mock_download_dataset.assert_called_once_with(
        Collection.HOTPOT_QA, ANY
    )
def test_load_multiple_questions(self, mock_download_dataset):
    """Two questions that differ only in their id load as two rows."""
    # Copy the fixture so that changing the id does not mutate the shared
    # TRAIN_QUESTION record.
    second_question = deepcopy(TRAIN_QUESTION)
    second_question["_id"] = "aaaabbbbccccdddd!2"
    self.write_questions(
        HotpotQAType.TRAIN, [TRAIN_QUESTION, second_question]
    )

    loaded_df = dm.HOTPOT_QA(HotpotQAType.TRAIN)

    mock_download_dataset.assert_called_once_with(
        Collection.HOTPOT_QA, ANY
    )
    self.assertEqual(len(loaded_df), 2)
def main():
    """Print the DEV_DISTRACTOR split and one randomly sampled question.

    After dumping the whole frame, a single row is sampled and its
    question, answer and gold paragraphs are printed; the paragraphs are
    labelled with consecutive letters starting at ``A``.
    """
    dataset = dm.HOTPOT_QA(HotpotQAType.DEV_DISTRACTOR)
    print(dataset)
    print("\n")

    # iterrows() yields (index, row) pairs; only the row of the single
    # sampled question is needed.
    _, row = next(dataset.sample(n=1).iterrows())
    paragraphs = row.gold_paragraphs

    print("Question: ", row.question)
    print("Answer: ", row.answer, "\n")

    # Counting from the code point of "A" turns the counter directly into
    # the paragraph's letter label.
    for code_point, paragraph in enumerate(paragraphs, start=ord("A")):
        print(f"Paragraph {chr(code_point)}) {paragraph}\n")
def test_load_in_dev_fullwiki_format(self, mock_download_dataset):
    """Check the frame produced for a DEV_FULLWIKI-format question.

    The expected frame lists two supporting facts but a single gold
    paragraph: the context has an entry titled "Limitless (EP)" but none
    titled exactly "Crown the Empire".
    NOTE(review): presumably the loader drops supporting facts whose
    title is absent from the context -- confirm against dm.HOTPOT_QA.
    """
    self.write_questions(
        HotpotQAType.DEV_FULLWIKI, [DEV_FULLWIKI_QUESTION]
    )

    loaded_df = dm.HOTPOT_QA(HotpotQAType.DEV_FULLWIKI)

    expected_records = json.loads("""[
        {
            "id": "5a899013554299515336131a",
            "question": "Some question",
            "answer": "Dallas",
            "gold_paragraphs": [
                "Sent good 3."
            ],
            "supporting_facts": [
                ["Limitless (EP)", 0],
                ["Crown the Empire", 0]
            ],
            "context": [
                [
                    "The Resistance: Rise of The Runaways",
                    ["Sent 1", " Sent 2.", " Sent 3."]
                ],
                [
                    "Reign of Terror (Capture the Crown album)",
                    ["Sent 4."]
                ],
                [
                    "Retrograde (Crown the Empire album)",
                    ["Sent 5.", " Sent 6."]
                ],
                [
                    "Roots (Sepultura album)",
                    ["Sent 7.", " Sent 8.", " Sent 9."]
                ],
                [
                    "Forest Stream",
                    ["Sent 10.", " Sent 11.", " Sent 12.", " Sent 13.", " Sent 14.", " Sent 15."]
                ],
                [
                    "The Crown (band)",
                    ["Sent 16."]
                ],
                [
                    "The Fallout (Crown the Empire album)",
                    ["Sent 17.", " Sent 18."]
                ],
                [
                    "Crown the Empire discography",
                    [" Sent 19."]
                ],
                [
                    "Crown the Empire - is missing",
                    ["Sent good 1.", " Sent good 2."]
                ],
                [
                    "Limitless (EP)",
                    ["Sent good 3."]
                ]
            ],
            "question_type": "bridge",
            "question_level": "medium"
        }
    ]""")
    expected_df = pd.DataFrame(expected_records)

    pd.testing.assert_frame_equal(loaded_df, expected_df)
    mock_download_dataset.assert_called_once_with(
        Collection.HOTPOT_QA, ANY
    )
def test_empty_dataset(self, mock_download_dataset):
    """A split written with no questions loads as a zero-length frame."""
    no_questions = []
    self.write_questions(HotpotQAType.TRAIN, no_questions)

    loaded_df = dm.HOTPOT_QA(HotpotQAType.TRAIN)

    mock_download_dataset.assert_called_once_with(
        Collection.HOTPOT_QA, ANY
    )
    self.assertEqual(len(loaded_df), 0)