Example #1
    def download(self):
        if self._check_integrity():
            print('Files already downloaded and verified')
            return
        download_and_extract_archive(self.url,
                                     self.root,
                                     filename=self.filename,
                                     md5=self.tgz_md5)
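
The _check_integrity helper called above is not shown in this excerpt. A
minimal sketch of what it might look like, assuming the class exposes root,
filename, and tgz_md5 as the call above suggests, and reusing check_integrity
from torchvision.datasets.utils (this only verifies the archive itself, not
the extracted files):

    def _check_integrity(self):
        # Hypothetical sketch: verify the downloaded archive against its MD5.
        from torchvision.datasets.utils import check_integrity
        fpath = os.path.join(self.root, self.filename)
        return check_integrity(fpath, self.tgz_md5)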
Example #2
    def __init__(self,
                 root,
                 train=True,
                 transform=None,
                 target_transform=None):
        super(Cifar100, self).__init__(root,
                                       transform=transform,
                                       target_transform=target_transform)

        base_folder = 'cifar-100-python'
        url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
        filename = "cifar-100-python.tar.gz"
        tgz_md5 = 'eb9058c3a382ffc7106e4002c42a8d85'
        train_list = [
            ['train', '16019d7e3df5f24257cddd939b257f8d'],
        ]
        test_list = [
            ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'],
        ]
        meta = {
            'filename': 'meta',
            'key': 'fine_label_names',
            'md5': '7973b15100ade9c7d40fb424638fde48',
        }

        targets = []
        data = []
        self.train = train

        download_and_extract_archive(url,
                                     self.root,
                                     filename=filename,
                                     md5=tgz_md5)

        if self.train:
            downloaded_list = train_list
        else:
            downloaded_list = test_list

        for file_name, checksum in downloaded_list:
            file_path = os.path.join(self.root, base_folder, file_name)

            with open(file_path, 'rb') as f:
                entry = pickle.load(f, encoding='latin1')
                data.extend(entry['data'])

                if 'labels' in entry:
                    targets.extend(entry['labels'])
                else:
                    targets.extend(entry['fine_labels'])

        # Reshape once after all files are read: stacking inside the loop
        # would turn `data` into an ndarray and break extend() on the next
        # iteration.
        data = np.vstack(data).reshape(-1, 3, 32, 32)
        data = data.transpose((0, 2, 3, 1))  # NCHW -> NHWC

        self.data = data
        self.targets = targets
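
This snippet assumes os, pickle, and numpy (as np) are imported at module
level, and that the parent class sets self.root from the root argument.
Because the constructor downloads and decodes everything eagerly, a minimal
usage sketch (hypothetical; iterating with a DataLoader would additionally
need __getitem__ and __len__, presumably inherited or defined elsewhere) is
just:

dataset = Cifar100('./data', train=True)
print(dataset.data.shape)       # (50000, 32, 32, 3) for the training split
print(len(dataset.targets))     # 50000 fine-grained labels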
Example #3
    def download(self) -> None:
        """Download the EMNIST data if it doesn't exist in processed_folder already."""
        import shutil

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)

        # download files
        print('Downloading and extracting zip archive')
        download_and_extract_archive(self.url,
                                     download_root=self.raw_folder,
                                     filename="emnist.zip",
                                     remove_finished=True,
                                     md5=self.md5)
        gzip_folder = os.path.join(self.raw_folder, 'gzip')
        for gzip_file in os.listdir(gzip_folder):
            if gzip_file.endswith('.gz'):
                extract_archive(os.path.join(gzip_folder, gzip_file),
                                gzip_folder)

        # process and save as torch files
        for split in self.splits:
            print('Processing ' + split)
            training_set = (
                read_image_file(
                    os.path.join(
                        gzip_folder,
                        'emnist-{}-train-images-idx3-ubyte'.format(split))),
                read_label_file(
                    os.path.join(
                        gzip_folder,
                        'emnist-{}-train-labels-idx1-ubyte'.format(split))))
            test_set = (
                read_image_file(
                    os.path.join(
                        gzip_folder,
                        'emnist-{}-test-images-idx3-ubyte'.format(split))),
                read_label_file(
                    os.path.join(
                        gzip_folder,
                        'emnist-{}-test-labels-idx1-ubyte'.format(split))))
            with open(
                    os.path.join(self.processed_folder,
                                 self._training_file(split)), 'wb') as f:
                torch.save(training_set, f)
            with open(
                    os.path.join(self.processed_folder,
                                 self._test_file(split)), 'wb') as f:
                torch.save(test_set, f)
        shutil.rmtree(gzip_folder)

        print('Done!')
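
The _training_file and _test_file helpers are referenced but not shown. A
plausible sketch, consistent with the file names implied by the per-split
processing above (hypothetical, since the actual helpers are outside this
excerpt):

    @staticmethod
    def _training_file(split):
        return 'training_{}.pt'.format(split)

    @staticmethod
    def _test_file(split):
        return 'test_{}.pt'.format(split)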
Example #4
    def download(self) -> None:
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)

        # download files
        for url, md5 in self.resources:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url,
                                         download_root=self.raw_folder,
                                         filename=filename,
                                         md5=md5)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte')))
        test_set = (
            read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte')))
        with open(os.path.join(self.processed_folder, self.training_file),
                  'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file),
                  'wb') as f:
            torch.save(test_set, f)

        print('Done!')
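
self.resources is iterated as (url, md5) pairs, so it is presumably a class
attribute along these lines (the URLs point at the classic MNIST distribution;
the checksum strings below are placeholders, not real digests):

    resources = [
        ("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz", "<md5>"),
        ("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz", "<md5>"),
        ("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz", "<md5>"),
        ("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz", "<md5>"),
    ]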
Example #5
import optparse
import os
import sys

# Append the parent dir to sys.path so sibling packages can be imported.
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parentdir)

MODEL_URL = 'https://s3.eu-central-1.amazonaws.com/nlp-cube/{}.tar.gz'
MODEL_LOCATION = 'corpus/trained_models/{}'

EMBEDDINGS_NAME = 'wiki.{}.vec'
FACEBOOK_EMBEDDINGS_URL = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'
FACEBOOK_EMBEDDINGS_LOCATION = 'corpus/'

if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--language', action='store', dest='language')
    (params, _) = parser.parse_args(sys.argv)
    if not params.language:
        print('\nRun the script in the following manner:\n'
              'python scripts/download_model.py --language ro\n')
        sys.exit(1)

    # Download Facebook embeddings for the provided language.
    name = EMBEDDINGS_NAME.format(params.language)
    language_url = FACEBOOK_EMBEDDINGS_URL + name
    location = FACEBOOK_EMBEDDINGS_LOCATION + name
    download_file(language_url, location)

    # Download the model from S3 for the provided language.
    model_url = MODEL_URL.format(params.language)
    model_location = MODEL_LOCATION.format(params.language)
    download_and_extract_archive(model_url, model_location)
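
download_file is imported from elsewhere in the original project and is not
shown here. A minimal standard-library stand-in, assuming it simply fetches a
URL to a local path (the real helper may add progress reporting or retries):

import urllib.request

def download_file(url, location):
    # Stream the remote file straight to the given local path.
    urllib.request.urlretrieve(url, location)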
Example #6
import logging
import optparse
import sys

EMBEDDINGS_NAME = 'wiki.{}.vec'
FACEBOOK_EMBEDDINGS_URL = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'
FACEBOOK_EMBEDDINGS_LOCATION = 'corpus/'

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--language', action='store', dest='language')
    (params, _) = parser.parse_args(sys.argv)
    if not params.language:
        print('\nRun the script in the following manner:\n'
              'python scripts/download_data.py --language ro\n')
        sys.exit(1)

    # Download Treebank CONLL Universal Dependencies data.
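    # (TREEBANK_CONLL, TREEBANK_LOCATION, TEST_DATA_CONLL, TEST_DATA_LOCATION,
    # UD_EVAL_SCRIPT and UD_EVAL_LOCATION are defined earlier in the original
    # script and are not part of this excerpt.)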
    download_and_extract_archive(TREEBANK_CONLL, TREEBANK_LOCATION)

    # Download test CONLL Universal Dependencies data.
    download_and_extract_archive(TEST_DATA_CONLL, TEST_DATA_LOCATION)

    # Download the conll17_ud_eval evaluation script.
    download_file(UD_EVAL_SCRIPT, UD_EVAL_LOCATION)

    # Download Facebook embeddings for the provided language.
    name = EMBEDDINGS_NAME.format(params.language)
    language_url = FACEBOOK_EMBEDDINGS_URL + name
    location = FACEBOOK_EMBEDDINGS_LOCATION + name
    download_file(language_url, location)
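
As with the model script in Example #5, this one is invoked with a two-letter
language code, e.g.:

python scripts/download_data.py --language ro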