示例#1
0
    def __init__(self,):
        """Download, extract and load the ``san_francisco_airbnb`` unit-test data.

        Steps, in order:

        1. Fetch ``san_francisco_airbnb_for_unit_tests.zip`` from
           ``get_repo_url()`` into the directory given by
           ``get_data_home_dir()``, passing the expected SHA-1 to ``download``.
        2. Extract the archive into ``self._path`` with
           ``protected_zip_extraction``.
        3. Read ``train.csv`` and ``test.csv``; the first CSV column becomes
           the index.
        4. Run every column named in ``self.image_columns`` through
           ``path_expander`` with ``<self._path>/images`` as base folder and
           print the first expanded value of each split.
        5. Replace both indexes with a fresh default one and print the number
           of rows in each split.
        """
        sha1sum_id = "652a17f1315ec0961336aa140cf983776400c933"
        dataset = "san_francisco_airbnb"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)

        # The base folder is identical for every element, so build it once
        # instead of re-joining the path inside each per-row callback.
        image_dir = os.path.join(self._path, "images")

        def expand(ele):
            return path_expander(ele, base_folder=image_dir)

        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(expand)
            self._test_df[img_col] = self._test_df[img_col].apply(expand)
            # The frames still carry the index read from the CSV at this point
            # (reset_index only runs below), so use positional access:
            # ``series[0]`` is a label lookup and only works when that index
            # happens to contain the label 0.
            print(self._train_df[img_col].iloc[0])
            print(self._test_df[img_col].iloc[0])

        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
示例#2
0
    def __init__(self,):
        """Download, extract and load the ``hateful_memes`` unit-test data.

        Steps, in order:

        1. Fetch ``hateful_memes_for_unit_tests.zip`` from ``get_repo_url()``
           into the directory given by ``get_data_home_dir()``, passing the
           expected SHA-1 to ``download``.
        2. Extract the archive into ``self._path`` with
           ``protected_zip_extraction``.
        3. Read ``train.csv`` and ``test.csv``; the first CSV column becomes
           the index.
        4. Run every column named in ``self.image_columns`` through
           ``path_expander`` with ``<self._path>/images`` as base folder and
           print the first expanded value of each split.
        5. Replace both indexes with a fresh default one and print the number
           of rows in each split.
        """
        sha1sum_id = "2aae657b786f505004ac2922b66097d60a540a58"
        dataset = "hateful_memes"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)

        # The base folder is identical for every element, so build it once
        # instead of re-joining the path inside each per-row callback.
        image_dir = os.path.join(self._path, "images")

        def expand(ele):
            return path_expander(ele, base_folder=image_dir)

        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(expand)
            self._test_df[img_col] = self._test_df[img_col].apply(expand)
            # The frames still carry the index read from the CSV at this point
            # (reset_index only runs below), so use positional access:
            # ``series[0]`` is a label lookup and only works when that index
            # happens to contain the label 0.
            print(self._train_df[img_col].iloc[0])
            print(self._test_df[img_col].iloc[0])
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
示例#3
0
    def __init__(self,):
        """Download, extract, load and sub-sample the ``petfinder`` unit-test data.

        Steps, in order:

        1. Fetch ``petfinder_for_unit_tests.zip`` from ``get_repo_url()`` into
           the directory given by ``get_data_home_dir()``, passing the expected
           SHA-1 to ``download``.
        2. Extract the archive into ``self._path`` with
           ``protected_zip_extraction``.
        3. Read ``train.csv`` and ``test.csv``; the first CSV column becomes
           the index.
        4. Run every column named in ``self.image_columns`` through
           ``path_expander`` with ``<self._path>/images`` as base folder and
           print the first expanded value of each split.
        5. Shrink each frame to the second part returned by
           ``train_test_split`` (``test_size=0.1``, stratified on the first
           entry of ``self.label_columns``, seeded with ``RandomState(123)``).
        6. Replace both indexes with a fresh default one and print the number
           of rows in each split.
        """
        sha1sum_id = "72cb19612318bb304d4a169804f525f88dc3f0d0"
        dataset = "petfinder"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)

        # The base folder is identical for every element, so build it once
        # instead of re-joining the path inside each per-row callback.
        image_dir = os.path.join(self._path, "images")

        def expand(ele):
            return path_expander(ele, base_folder=image_dir)

        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(expand)
            self._test_df[img_col] = self._test_df[img_col].apply(expand)
            # The frames still carry the index read from the CSV at this point
            # (reset_index only runs below), so use positional access:
            # ``series[0]`` is a label lookup and only works when that index
            # happens to contain the label 0.
            print(self._train_df[img_col].iloc[0])
            print(self._test_df[img_col].iloc[0])

        # Only the second (10%) part of each split is kept; the first part is
        # discarded. A fresh RandomState with the same seed is used per call.
        _, self._train_df = train_test_split(
            self._train_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._train_df[self.label_columns[0]],
        )
        _, self._test_df = train_test_split(
            self._test_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._test_df[self.label_columns[0]],
        )
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
示例#4
0
    def __init__(
        self,
    ):
        """Download the cross-lingual Amazon review archive and load English samples.

        The zip is fetched from ``get_repo_url()`` into the directory returned
        by ``get_data_home_dir()`` and extracted into that same directory
        (presumably the archive already contains a top-level
        ``amazon_review_sentiment_cross_lingual`` folder, since the TSV files
        are then read from ``self._path`` -- confirm against the archive).

        ``en_train.tsv`` and ``en_test.tsv`` are header-less, tab-separated
        files read with the columns ``label`` and ``text``. 1000 training rows
        and 200 test rows are sampled with ``random_state=123``; both frames
        then get a fresh default index and their sizes are printed.
        """
        dataset_name = "amazon_review_sentiment_cross_lingual"
        expected_sha1 = "9c701aa6fc42ec3fe429bfe85a8dac4532ab9fcd"
        archive_name = f"{dataset_name}.zip"
        archive_url = get_repo_url() + archive_name
        archive_path = os.path.join(get_data_home_dir(), archive_name)
        self._path = os.path.join(get_data_home_dir(), dataset_name)

        download(url=archive_url, path=archive_path, sha1_hash=expected_sha1)
        protected_zip_extraction(
            archive_path,
            sha1_hash=expected_sha1,
            folder=get_data_home_dir(),
        )

        def _sampled_split(tsv_name, n_rows):
            # Read one header-less TSV split and draw a fixed-seed sample from it.
            frame = pd.read_csv(
                os.path.join(self._path, tsv_name),
                sep="\t",
                header=None,
                names=["label", "text"],
            )
            return frame.sample(n_rows, random_state=123)

        self._train_en_df = _sampled_split("en_train.tsv", 1000)
        self._test_en_df = _sampled_split("en_test.tsv", 200)

        # Sampling keeps the original row labels; swap them for 0..n-1.
        for frame in (self._train_en_df, self._test_en_df):
            frame.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_en_df)}")
        print(f"test sample num: {len(self._test_en_df)}")
示例#5
0
    def __init__(self,):
        """Download, extract and load the ``ae`` unit-test data.

        ``ae_for_unit_tests.zip`` is fetched from ``get_repo_url()`` into the
        directory returned by ``get_data_home_dir()`` and extracted into
        ``self._path``. ``train.csv`` and ``test.csv`` are read with their
        first column as index, after which both frames get a fresh default
        index and their sizes are printed.
        """
        dataset_name = "ae"
        expected_sha1 = "8c2a25555c49ef2b30545004488022465808d03f"
        archive_name = f"{dataset_name}_for_unit_tests.zip"
        archive_url = get_repo_url() + archive_name
        archive_path = os.path.join(get_data_home_dir(), archive_name)
        self._path = os.path.join(get_data_home_dir(), dataset_name)

        download(url=archive_url, path=archive_path, sha1_hash=expected_sha1)
        protected_zip_extraction(archive_path, sha1_hash=expected_sha1, folder=self._path)

        train_csv = os.path.join(self._path, 'train.csv')
        test_csv = os.path.join(self._path, 'test.csv')
        self._train_df = pd.read_csv(train_csv, index_col=0)
        self._test_df = pd.read_csv(test_csv, index_col=0)

        # Drop the index that came from the CSV in favour of 0..n-1.
        for frame in (self._train_df, self._test_df):
            frame.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")