예제 #1
0
    def test_datamine_cache_dir(self):
        """Exercises every code path of datamine_cache_dir()."""
        # No env variable is present: the default directory is used.
        if DATAMINE_CACHE_DIR_ENV_VAR in os.environ:
            del os.environ[DATAMINE_CACHE_DIR_ENV_VAR]
        with patch('os.path.isfile', return_value=False):
            with patch('os.path.isdir', return_value=True):
                self.assertEqual(os.path.basename(datamine_cache_dir()),
                                 ".datamine_cache_dir")

        # Env variable is set.
        os.environ[DATAMINE_CACHE_DIR_ENV_VAR] = "my-fake-dir-for-testing"
        with patch('os.path.isfile', return_value=False):
            with patch('os.path.isdir', return_value=True):
                self.assertEqual(os.path.basename(datamine_cache_dir()),
                                 "my-fake-dir-for-testing")

        # Env variable is set but empty. Fall back to the default dir.
        os.environ[DATAMINE_CACHE_DIR_ENV_VAR] = ""
        with patch('os.path.isfile', return_value=False):
            with patch('os.path.isdir', return_value=True):
                self.assertEqual(os.path.basename(datamine_cache_dir()),
                                 ".datamine_cache_dir")

        # Directory is created if it does not exist.
        with patch('os.path.isfile', return_value=False):
            with patch('os.path.isdir', return_value=False):
                cache_dir = None
                with patch('os.makedirs') as makedirs_mock:
                    cache_dir = datamine_cache_dir()
                # 0o755 == 493; octal literal for consistency with the
                # other makedirs() calls in this test suite.
                makedirs_mock.assert_called_once_with(cache_dir, mode=0o755)

        # Directory is not created if it already exists.
        with patch('os.path.isfile', return_value=False):
            with patch('os.path.isdir', return_value=True):
                with patch('os.makedirs') as makedirs_mock:
                    datamine_cache_dir()
                makedirs_mock.assert_not_called()

        # Program exits if the cache directory path points to a file.
        with patch('sys.stdout', new_callable=StringIO) as mock_stdout:
            with patch('os.path.isfile', return_value=True):
                with self.assertRaises(SystemExit) as context:
                    datamine_cache_dir()
                self.assertEqual(context.exception.code, 1)
        self.assertIn("is a file.\n", mock_stdout.getvalue())

        # The function checks if the directory is a file before
        # creating a new directory (and the argument is correct).
        with patch('os.path.isfile') as mock_isfile:
            mock_isfile.return_value = False
            cache_dir = None
            with patch('os.path.isdir', return_value=True):
                cache_dir = datamine_cache_dir()
            mock_isfile.assert_called_once_with(cache_dir)
예제 #2
0
    def test_dataset_is_downloaded_if_missing(self, mock_config):
        """The dataset files are fetched and unpacked when not cached."""
        mock_config.return_value = self.FAKE_CONFIG
        responses.add(responses.GET, "http://fake-website.com/my/files.zip",
                      body=self.FAKE_URL_DATA1, status=200,
                      headers={'content-length': str(len(self.FAKE_URL_DATA1))},  # noqa: E501
                      stream=True)
        responses.add(responses.GET, "http://fake-website.com/my2/file.json",
                      body=self.FAKE_URL_DATA2, status=200,
                      headers={'content-length': str(len(self.FAKE_URL_DATA2))},  # noqa: E501
                      stream=True)
        return_code = download_dataset(Collection.RACE, lambda _: False)
        self.assertEqual(return_code, 2)  # 2 == dataset has been downloaded.

        data_dir = os.path.join(datamine_cache_dir(), self.FAKE_DATASET.name)

        def read_file(relative_path):
            # Use a context manager so the handle is closed (the previous
            # implementation leaked one file descriptor per assertion).
            with open(os.path.join(data_dir, relative_path), "rt") as f:
                return f.read()

        self.assertEqual(read_file("1.txt"), "First question")
        self.assertEqual(read_file("2.txt"), "Second question")
        self.assertEqual(read_file("dir/3.txt"), "Third question")
        self.assertEqual(read_file("file.json"), "This is a JSON file.")
예제 #3
0
 def test_type_to_data_file(self):
     """Every HotpotQA split maps to a file inside the cache directory."""
     cache_dir = datamine_cache_dir()
     for qa_type in HotpotQAType:
         data_file = type_to_data_file(qa_type)
         self.assertTrue(data_file.startswith(cache_dir))
         self.assertIn("HOTPOT_QA", data_file)
         # The file name itself must mention the split (train/dev/...).
         self.assertIn(qa_type.name.lower(), os.path.basename(data_file))
예제 #4
0
    def setUp(self):
        """Creates the fake DROP dataset directory on the fake filesystem."""
        self.setUpPyfakefs()

        dataset_dir = os.path.join(
                datamine_cache_dir(),
                Collection.ALLEN_AI_DROP.name,
                "drop_dataset"
        )
        os.makedirs(dataset_dir, mode=0o755)
        # Path of the (not yet written) dev split file used by the tests.
        self.file_path = os.path.join(dataset_dir, "drop_dataset_dev.json")
예제 #5
0
    def setUp(self):
        """Copies the real config files and a fake dataset into a fake FS."""
        self.FAKE_DATASET = Collection.RACE
        self.CACHE_DIR = datamine_cache_dir()

        config_dir = os.path.join(PROJECT_ROOT, "zookeeper", "config")

        # Read the real (prod) configuration and its schema BEFORE the
        # fake filesystem is installed (afterwards the real files are no
        # longer reachable).
        real_configs = {}
        for filename in ("config.json", "config_schema.json"):
            with open(os.path.join(config_dir, filename), "rt") as f:
                real_configs[filename] = f.read()

        self.setUpPyfakefs()

        if not os.path.isdir(config_dir):
            os.makedirs(config_dir, mode=0o755)

        # Replay the real configuration files onto the fake FS.
        for filename, contents in real_configs.items():
            with open(os.path.join(config_dir, filename), "wt") as g:
                g.write(contents)
                g.flush()

        del config_dir
        del real_configs

        # Write the fake files from the dataset:
        #  * file1.txt
        #  * file2.txt
        #  * data/file3.txt
        dataset_dir = os.path.join(self.CACHE_DIR, self.FAKE_DATASET.name)
        if not os.path.isdir(dataset_dir):
            os.makedirs(dataset_dir, mode=0o755)
        to_write = [
                ("file1.txt", "What a beautiful day!"),
                ("file2.txt", "Thank you God!\n"),
                ("data/file3.txt", "Within some inner directory."),
        ]
        for path, contents in to_write:
            path = os.path.join(dataset_dir, path)
            if not os.path.isdir(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path), mode=0o755)
            with open(path, "wt") as g:
                g.write(contents)
                g.flush()

        # SHA256 digests of the three fake dataset files written above.
        self.FILE1_SHA = "51a1155af5ffb7bc8daeaca2a6750b065f48ecb0fbaff3daefc014c7a66206c9"  # noqa: E501
        self.FILE2_SHA = "67519f882c41fab92d71c951a1587ba4a46684031f03bc9b2e58c9a9e36c23fb"  # noqa: E501
        self.FILE3_SHA = "55287e7941404e7bfe281f7c2f6dea75685f32329a5682a153229d43f7335532"  # noqa: E501
예제 #6
0
 def test_exception_raised_if_url_not_reachable(self, mock_config):
     """download_dataset() propagates errors for unreachable URLs."""
     # Pre-create the dataset directory; this also verifies that
     # download_dataset() does not attempt to recreate an existing one.
     dataset_dir = os.path.join(datamine_cache_dir(), self.FAKE_DATASET.name)
     os.makedirs(dataset_dir, mode=0o755)
     mock_config.return_value = self.FAKE_CONFIG
     with self.assertRaises(Exception):
         download_dataset(Collection.RACE, lambda _: False)
예제 #7
0
 def test_type_to_data_file(self):
     """Each CosmosQA split maps to its expected data file name."""
     split_to_filename = {
         CosmosQAType.TRAIN: "train.jsonl",
         CosmosQAType.DEV: "valid.jsonl",
         CosmosQAType.TEST: "test.jsonl"
     }
     cache_dir = datamine_cache_dir()
     for qa_type in CosmosQAType:
         data_file = type_to_data_file(qa_type)
         self.assertTrue(data_file.startswith(cache_dir))
         self.assertIn("COSMOS_QA", data_file)
         self.assertEqual(os.path.basename(data_file),
                          split_to_filename[qa_type])
예제 #8
0
 def test_type_to_data_directory(self):
     """Every RACE (split, level) pair maps to the expected directory."""
     base_dir = os.path.join(datamine_cache_dir(), "RACE")
     # Note: the expected layout nests a second "RACE" component, matching
     # how the original archive unpacks inside the cache directory.
     cases = [
         (RACEType.TRAIN_MIDDLE, "train", "middle"),
         (RACEType.DEV_MIDDLE, "dev", "middle"),
         (RACEType.TEST_MIDDLE, "test", "middle"),
         (RACEType.TRAIN_HIGH, "train", "high"),
         (RACEType.DEV_HIGH, "dev", "high"),
         (RACEType.TEST_HIGH, "test", "high"),
     ]
     for race_type, split, level in cases:
         self.assertEqual(
                 type_to_data_directory(race_type),
                 os.path.join(base_dir, "RACE", split, level)
         )
예제 #9
0
def download_dataset(dataset_id, integrity_check):
    """
    Downloads a dataset identified by its dataset ID (Collection).

    The maybe already downloaded local copy is checked for integrity
    according to the specified integrity check. If the local version is up to
    date, then nothing is done. Otherwise, the dataset is downloaded.

    Args:
        dataset_id (Collection): the dataset to download.
        integrity_check (callable): receives the dataset_id and returns
            True if the local copy is valid (no download needed).

    Returns a code (int) with the following semantics:
    * 1: dataset is available locally and the integrity check passed;
    * 2: the dataset has been downloaded (was not available locally).
    """
    assert isinstance(dataset_id, Collection)
    if integrity_check(dataset_id):  # Dataset is already downloaded.
        return 1
    msg.info("Downloading {} ...".format(dataset_id.name))
    config = load_datasets_config()[dataset_id.name]
    dataset_dir = os.path.join(datamine_cache_dir(), dataset_id.name)
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir, mode=0o755)

    # Download all the requirements.
    for requirement in config["requirements"]:
        url = requirement["URL"]
        expected_sha256 = requirement["SHA256"]

        # Attempt to guess the filename from the URL. In the future,
        # if it is required, we may have another field in the requirements.
        filename = url_to_filename(url)
        assert filename is not None and len(filename) > 0
        filepath = os.path.join(dataset_dir, filename)

        # Skips the download if a local file already matches the SHA256.
        download_file_if_missing(url,
                                 filepath,
                                 expected_sha256=expected_sha256,
                                 desc="Downloading {}".format(filename))
        assert os.path.isfile(filepath)

        # Unpack the file if it is archived or compressed.
        if is_archive(filepath):
            msg.info("Unpacking {} ...".format(filename))
            extract_archive(filepath, outdir=dataset_dir)
    msg.info("{} has been downloaded.".format(dataset_id.name))
    return 2
예제 #10
0
def download():
    """CLI entry point: downloads the dataset named in sys.argv[1].

    Exits with an error message if no dataset name is supplied or if the
    name does not match any member of the Collection enum.
    """
    assert len(sys.argv) >= 1
    assert sys.argv[0] == "data_mine download"

    if len(sys.argv) != 2:
        msg.error("Usage: python -m data_mine download <dataset_name>",
                  exits=1)  # noqa: E501
    dataset_name = sys.argv[1]
    # Validate the requested dataset against the known collection names.
    if dataset_name not in {x.name for x in Collection}:
        msg.error("Invalid dataset: {}".format(dataset_name))
        msg.info("Available datasets:")
        msg.info("\n".join(sorted(x.name for x in Collection)), exits=1)

    dataset_id = Collection.from_str(dataset_name)
    msg.info("Checking if {} is already downloaded ...".format(dataset_name))
    return_code = download_dataset(dataset_id, check_deep_integrity)
    if return_code == 1:
        msg.info("{} already available at: {}".format(
            dataset_name, os.path.join(datamine_cache_dir(), dataset_name)))
예제 #11
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the ALLEN_AI_DROP dataset files.
DROP_CACHE_DIR = os.path.join(datamine_cache_dir(), "ALLEN_AI_DROP")
예제 #12
0
 def setUp(self):
     """Builds the fake OBQA dataset layout on the fake filesystem."""
     self.setUpPyfakefs()
     # Mirrors the directory structure of the extracted OBQA archive.
     self.dataset_dir = os.path.join(
             datamine_cache_dir(), "ALLEN_AI_OBQA",
             "OpenBookQA-V1-Sep2018", "Data", "Main"
     )
     os.makedirs(self.dataset_dir, mode=0o755)
예제 #13
0
 def test_retrieved_facts(self, mock_download_dataset):
     """Loading OBQA with with_retrieved_facts=True attaches the facts.

     Writes RETRIEVED_FACTS to the expected cache location, loads two
     questions, and checks the resulting DataFrame (including the
     per-question "retrieved_facts" column) against an inline fixture.
     """
     # The loader reads the facts from this well-known cache path.
     facts_file = os.path.join(datamine_cache_dir(), "ALLEN_AI_OBQA",
                               "extracted_facts.json")  # noqa: E501
     with open(facts_file, "wt") as g:
         json.dump(RETRIEVED_FACTS, g)
     self.write_questions(OBQAType.TRAIN, [GOOD_QUESTION1, GOOD_QUESTION2])
     df = dm.ALLEN_AI_OBQA(OBQAType.TRAIN, with_retrieved_facts=True)
     # Expected frame: the two questions, each augmented with its facts
     # (note the fact order is part of the expected output).
     expected_df = pd.DataFrame(
         json.loads("""[
         {
             "id": "7-980",
             "question": "The sun is responsible for",
             "answers": [
                 "puppies learning new tricks",
                 "children growing up and getting old",
                 "flowers wilting in a vase",
                 "plants sprouting, blooming and wilting"
             ],
             "correct": "D",
             "retrieved_facts": [
                 {
                     "context": "Context 1",
                     "token_based": [
                         "tb11",
                         "tb12"
                     ],
                     "vector_based": [
                         "vb1"
                     ]
                 },
                 {
                     "context": "Context 2",
                     "token_based": [
                         "tb21",
                         "tb22"
                     ],
                     "vector_based": [
                         "vb2"
                     ]
                 },
                 {
                     "context": "Context 3",
                     "token_based": [
                         "tb31",
                         "tb32"
                     ],
                     "vector_based": [
                         "vb3"
                     ]
                 },
                 {
                     "context": "Context 4",
                     "token_based": [
                         "tb41",
                         "tb42"
                     ],
                     "vector_based": [
                         "vb4"
                     ]
                 }
             ]
         },
         {
             "id": "1158",
             "question": "Which product cannot convert energy into light?",
             "answers": [
                 "chandelier",
                 "charger",
                 "floor lamp",
                 "Christmas tree lights"
             ],
             "correct": "B",
             "retrieved_facts": [
                 {
                     "context": "Context 8",
                     "token_based": [
                         "tb81",
                         "tb82"
                     ],
                     "vector_based": [
                         "vb8"
                     ]
                 },
                 {
                     "context": "Context 6",
                     "token_based": [
                         "tb61",
                         "tb62"
                     ],
                     "vector_based": [
                         "vb6"
                     ]
                 },
                 {
                     "context": "Context 5",
                     "token_based": [
                         "tb51",
                         "tb52"
                     ],
                     "vector_based": [
                         "vb5"
                     ]
                 },
                 {
                     "context": "Context 7",
                     "token_based": [
                         "tb71",
                         "tb72"
                     ],
                     "vector_based": [
                         "vb7"
                     ]
                 }
             ]
         }
         ]"""))
     pd.testing.assert_frame_equal(df, expected_df)
     # The loader must trigger (at most) one download attempt.
     mock_download_dataset.assert_called_once_with(Collection.ALLEN_AI_OBQA,
                                                   ANY)  # noqa: E501
예제 #14
0
    def setUp(self):
        """Creates the fake COSMOS_QA cache directory."""
        self.setUpPyfakefs()

        cosmos_dir = os.path.join(datamine_cache_dir(), "COSMOS_QA")
        os.makedirs(cosmos_dir, mode=0o755)
예제 #15
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the RACE dataset files.
RACE_CACHE_DIR = os.path.join(datamine_cache_dir(), "RACE")
예제 #16
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the ALLEN_AI_ARC dataset files.
ARC_CACHE_DIR = os.path.join(datamine_cache_dir(), "ALLEN_AI_ARC")
예제 #17
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the ALLEN_AI_OBQA dataset files.
OBQA_CACHE_DIR = os.path.join(datamine_cache_dir(), "ALLEN_AI_OBQA")
예제 #18
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the HOTPOT_QA dataset files.
HOTPOT_QA_CACHE_DIR = os.path.join(datamine_cache_dir(), "HOTPOT_QA")
예제 #19
0
import os

from data_mine.utils import datamine_cache_dir

# Local cache directory holding the COSMOS_QA dataset files.
COSMOS_QA_CACHE_DIR = os.path.join(datamine_cache_dir(), "COSMOS_QA")