Exemplo n.º 1
0
    def _download_and_extract(self, key, url, download_folder):
        """Download the archive at ``url`` into ``download_folder`` and unpack it.

        Nothing is downloaded when the archive file is already in
        ``download_folder`` or when the extraction folder exists and is
        non-empty.

        Args:
            key: Identifier; the part before its first underscore is used as
                the file type in progress messages.
            url: Archive location; the text after the last ``/`` is taken as
                the archive file name.
            download_folder: Directory receiving the archive (created when
                missing).

        Returns:
            ``download_folder`` joined with the archive name truncated at its
            first dot.
        """
        kind = key.split("_")[0]
        os.makedirs(download_folder, exist_ok=True)

        archive_name = url.split("/")[-1]
        archive_path = os.path.join(download_folder, archive_name)
        extraction_folder = os.path.join(download_folder,
                                         archive_name.split(".")[0])

        # Either the archive itself or previously extracted content counts
        # as "already present".
        already_present = os.path.exists(archive_path)
        if not already_present and os.path.exists(extraction_folder):
            already_present = len(os.listdir(extraction_folder)) != 0

        report = self.writer.write
        if already_present:
            report("{} {} already present. Skipping download.".format(
                self.dataset_proper_name, kind))
            return extraction_folder

        report("Downloading the {} {} now.".format(
            self.dataset_proper_name, kind))
        download(url, download_folder, archive_name)

        report("Extracting the {} {} now. This may take time".format(
            self.dataset_proper_name, kind))
        decompress(download_folder, archive_name)

        return extraction_folder
Exemplo n.º 2
0
    def _download_and_extract(self, key, url, download_folder):
        """Fetch and unpack the archive behind ``url`` unless it is already there.

        The archive counts as present when its file exists in
        ``download_folder`` or when the extraction folder exists and holds at
        least one entry; in that case only an info message is logged.

        Args:
            key: Identifier whose prefix (up to the first underscore) names
                the file type in log messages.
            url: Archive location; the segment after the last ``/`` is the
                archive file name.
            download_folder: Target directory, created if it does not exist.

        Returns:
            Path of the extraction folder: ``download_folder`` joined with the
            archive name cut at its first dot.
        """
        file_type = key.split("_")[0]
        os.makedirs(download_folder, exist_ok=True)

        archive_name = url.split("/")[-1]
        archive_path = os.path.join(download_folder, archive_name)
        extraction_folder = os.path.join(download_folder, archive_name.split(".")[0])

        def _extracted_content_exists():
            # Only list the folder once we know it exists.
            if not os.path.exists(extraction_folder):
                return False
            return len(os.listdir(extraction_folder)) != 0

        if os.path.exists(archive_path) or _extracted_content_exists():
            logger.info(
                f"{self.dataset_proper_name} {file_type} already present. "
                + "Skipping download."
            )
            return extraction_folder

        logger.info(f"Downloading the {self.dataset_proper_name} {file_type} now.")
        download(url, download_folder, archive_name)

        logger.info(
            f"Extracting the {self.dataset_proper_name} {file_type} now. "
            + "This may take time"
        )
        decompress(download_folder, archive_name)

        return extraction_folder
Exemplo n.º 3
0
    def convert(self):
        """Move the dataset zip into place, unzip it and sort out its contents.

        The data folder is ``self.args.mmf_data_folder`` when given, otherwise
        the configured ``env.data_dir``; everything is laid out below
        ``<data folder>/datasets/hateful_memes/defaults``:

        * the zip is moved into ``images`` and unzipped there using
          ``self.args.password`` (``self.checksum`` is called first unless
          ``self.args.bypass_checksum`` is truthy);
        * ``self.assert_files`` is called on the ``images`` folder;
        * every file in ``self.JSONL_FILES`` is moved from ``images/data``
          into ``annotations``;
        * every entry of ``self.IMAGE_FILES`` found in ``images/data`` is
          moved into ``images`` and, if its path ends in ``.tar.gz``,
          decompressed there without deleting the archive.
        """
        data_dir = self.configuration.get_config().env.data_dir
        if self.args.mmf_data_folder:
            data_dir = self.args.mmf_data_folder

        skip_checksum = bool(self.args.bypass_checksum)

        print(f"Data folder is {data_dir}")
        print(f"Zip path is {self.args.zip_file}")

        dataset_root = os.path.join(data_dir, "datasets", "hateful_memes",
                                    "defaults")
        images_path = os.path.join(dataset_root, "images")
        PathManager.mkdirs(images_path)

        if not skip_checksum:
            self.checksum(self.args.zip_file, self.POSSIBLE_CHECKSUMS)

        # Bring the zip next to where its contents will live, then unpack.
        src = self.args.zip_file
        print(f"Moving {src}")
        move(src, images_path)

        print(f"Unzipping {src}")
        self.decompress_zip(images_path,
                            fname=os.path.basename(src),
                            password=self.args.password)

        self.assert_files(images_path)

        annotations_path = os.path.join(dataset_root, "annotations")
        PathManager.mkdirs(annotations_path)

        for annotation in self.JSONL_FILES:
            print(f"Moving {annotation}")
            move(os.path.join(images_path, "data", annotation),
                 annotations_path)

        for image_file in self.IMAGE_FILES:
            extracted = os.path.join(images_path, "data", image_file)
            # Entries that are not in the unzipped data are skipped silently.
            if not PathManager.exists(extracted):
                continue
            print(f"Moving {image_file}")
            move(extracted, images_path)
            if extracted.endswith(".tar.gz"):
                decompress(images_path, fname=image_file,
                           delete_original=False)
Exemplo n.º 4
0
    def convert(self):
        """Unpack the dataset zip and arrange its contents under the data folder.

        Steps, in order:

        1. Resolve the data folder: ``self.args.mmf_data_folder`` when given,
           otherwise the configured ``env.data_dir``.
        2. Unless ``self.args.bypass_checksum`` is truthy, call
           ``self.checksum`` on the zip file.
        3. Move the zip into ``<data folder>/images`` when ``self.args.move``
           is truthy, otherwise copy it; then unzip it there using
           ``self.args.password``.
        4. Call ``self.assert_files`` on the images folder. When it returns
           ``True`` the files in ``self.JSONL_PHASE_ONE_FILES`` are used as
           annotations, otherwise those in ``self.JSONL_PHASE_TWO_FILES``.
           Each is moved from ``images/data`` into
           ``<data folder>/annotations``.
        5. Each entry of ``self.IMAGE_FILES`` present in ``images/data`` is
           moved up into ``images``; entries ending in ``.tar.gz`` are then
           decompressed in ``images`` and the archive is kept.
        """
        config = self.configuration.get_config()
        data_dir = config.env.data_dir
        if self.args.mmf_data_folder:
            data_dir = self.args.mmf_data_folder

        bypass_checksum = bool(self.args.bypass_checksum)

        print(f"Data folder is {data_dir}")
        print(f"Zip path is {self.args.zip_file}")

        base_path = data_dir

        images_path = os.path.join(base_path, "images")
        PathManager.mkdirs(images_path)

        move_dir = bool(self.args.move)

        if not bypass_checksum:
            self.checksum(self.args.zip_file, self.POSSIBLE_CHECKSUMS)

        src = self.args.zip_file
        if move_dir:
            print(f"Moving {src}")
            move(src, images_path)
        else:
            print(f"Copying {src}")
            copy(src, images_path)

        print(f"Unzipping {src}")
        self.decompress_zip(images_path,
                            fname=os.path.basename(src),
                            password=self.args.password)

        phase_one = self.assert_files(images_path)

        annotations_path = os.path.join(base_path, "annotations")
        PathManager.mkdirs(annotations_path)
        annotations = (self.JSONL_PHASE_ONE_FILES
                       if phase_one is True else self.JSONL_PHASE_TWO_FILES)

        for annotation in annotations:
            print(f"Moving {annotation}")
            # Target the full file path rather than the folder.
            move(os.path.join(images_path, "data", annotation),
                 os.path.join(annotations_path, annotation))

        for image_file in self.IMAGE_FILES:
            src = os.path.join(images_path, "data", image_file)
            # Entries missing from the unzipped data are skipped silently.
            if not PathManager.exists(src):
                continue
            print(f"Moving {image_file}")
            move(src, os.path.join(images_path, image_file))
            if image_file.endswith(".tar.gz"):
                # The archive now sits directly inside images_path, so pass
                # that folder plus the file name. Passing the moved file's
                # full path as the folder would make the archive resolve to
                # <images>/<file>/<file>.
                # NOTE(review): assumes decompress(folder, fname) locates the
                # archive at folder/fname - confirm against its definition.
                decompress(images_path, fname=image_file,
                           delete_original=False)