예제 #1
0
def test_load_old_checkpoint(cls):
    dataset = AmazonReviewSentimentCrossLingualDataset()
    sha1sum_id = "4ba096cdf6bd76c06386f2c27140db055e59c91b"
    checkpoint_name = "mdeberta-v3-base-checkpoint"
    save_path = os.path.join(get_home_dir(), "checkpoints")
    file_path = os.path.join(save_path, f"{checkpoint_name}.zip")
    checkpoint_path = os.path.join(get_home_dir(), "checkpoints",
                                   checkpoint_name)
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    download(
        url=
        f"s3://automl-mm-bench/unit-tests-0.4/checkpoints/{checkpoint_name}.zip",
        path=file_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        file_path,
        sha1_hash=sha1sum_id,
        folder=save_path,
    )
    predictor = cls.load(checkpoint_path)
    verify_predictor_save_load(predictor, dataset.test_df, cls=cls)

    # continuous training
    predictor.fit(
        dataset.train_df,
        presets="multilingual",
        time_limit=10,
        hyperparameters={"optimization.top_k_average_method": "uniform_soup"},
    )
    verify_predictor_save_load(predictor, dataset.test_df, cls=cls)
예제 #2
0
    def __init__(self,):
        sha1sum_id = "652a17f1315ec0961336aa140cf983776400c933"
        dataset = "san_francisco_airbnb"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)
        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(
                lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            self._test_df[img_col] =\
                self._test_df[img_col].apply(
                    lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            print(self._train_df[img_col][0])
            print(self._test_df[img_col][0])

        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
예제 #3
0
    def __init__(self,):
        sha1sum_id = "2aae657b786f505004ac2922b66097d60a540a58"
        dataset = "hateful_memes"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)
        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(
                lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            self._test_df[img_col] =\
                self._test_df[img_col].apply(
                    lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            print(self._train_df[img_col][0])
            print(self._test_df[img_col][0])
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
예제 #4
0
    def __init__(self,):
        sha1sum_id = "72cb19612318bb304d4a169804f525f88dc3f0d0"
        dataset = "petfinder"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)
        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(
                lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            self._test_df[img_col] =\
                self._test_df[img_col].apply(
                    lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            print(self._train_df[img_col][0])
            print(self._test_df[img_col][0])

        _, self._train_df = train_test_split(
            self._train_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._train_df[self.label_columns[0]],
        )
        _, self._test_df = train_test_split(
            self._test_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._test_df[self.label_columns[0]],
        )
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
예제 #5
0
    def __init__(
        self,
    ):
        sha1sum_id = "9c701aa6fc42ec3fe429bfe85a8dac4532ab9fcd"
        dataset = "amazon_review_sentiment_cross_lingual"
        file_name = f"{dataset}.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=get_data_home_dir(),
        )
        self._train_en_df = pd.read_csv(
            os.path.join(self._path, "en_train.tsv"),
            sep="\t",
            header=None,
            names=["label", "text"],
        ).sample(1000, random_state=123)

        self._test_en_df = pd.read_csv(
            os.path.join(self._path, "en_test.tsv"),
            sep="\t",
            header=None,
            names=["label", "text"],
        ).sample(200, random_state=123)

        self._train_en_df.reset_index(drop=True, inplace=True)
        self._test_en_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_en_df)}")
        print(f"test sample num: {len(self._test_en_df)}")
예제 #6
0
    def __init__(self,):
        sha1sum_id = "8c2a25555c49ef2b30545004488022465808d03f"
        dataset = "ae"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")