def __init__(self):
    """Fetch the ``san_francisco_airbnb`` unit-test archive and load its splits.

    The archive URL is ``get_repo_url()`` plus the archive file name; the
    archive is saved under ``get_data_home_dir()`` and extracted into a
    folder named after the dataset. ``train.csv`` and ``test.csv`` from that
    folder are read into ``self._train_df`` and ``self._test_df`` (first CSV
    column used as the index), every column in ``self.image_columns`` is
    passed through ``path_expander`` with the extracted ``images`` folder as
    base folder, and finally both indexes are reset to ``0..n-1``.
    """
    sha1sum_id = "652a17f1315ec0961336aa140cf983776400c933"
    dataset = "san_francisco_airbnb"
    file_name = f"{dataset}_for_unit_tests.zip"
    url = get_repo_url() + file_name
    save_path = os.path.join(get_data_home_dir(), file_name)
    self._path = os.path.join(get_data_home_dir(), dataset)
    download(
        url=url,
        path=save_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        save_path,
        sha1_hash=sha1sum_id,
        folder=self._path,
    )
    self._train_df = pd.read_csv(os.path.join(self._path, "train.csv"), index_col=0)
    self._test_df = pd.read_csv(os.path.join(self._path, "test.csv"), index_col=0)

    # The base folder is identical for every cell, so build it once instead
    # of once per row inside the lambdas.
    image_dir = os.path.join(self._path, "images")
    for img_col in self.image_columns:
        self._train_df[img_col] = self._train_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        self._test_df[img_col] = self._test_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        # The index still carries the labels read from the CSV here (it is
        # only reset below), so show the first row by position; ``[0]`` would
        # look up the *label* 0 and fail when no such label exists.
        print(self._train_df[img_col].iloc[0])
        print(self._test_df[img_col].iloc[0])

    self._train_df.reset_index(drop=True, inplace=True)
    self._test_df.reset_index(drop=True, inplace=True)
    print(f"train sample num: {len(self._train_df)}")
    print(f"test sample num: {len(self._test_df)}")
def __init__(self):
    """Fetch the ``hateful_memes`` unit-test archive and load its splits.

    The archive URL is ``get_repo_url()`` plus the archive file name; the
    archive is saved under ``get_data_home_dir()`` and extracted into a
    folder named after the dataset. ``train.csv`` and ``test.csv`` from that
    folder are read into ``self._train_df`` and ``self._test_df`` (first CSV
    column used as the index), every column in ``self.image_columns`` is
    passed through ``path_expander`` with the extracted ``images`` folder as
    base folder, and finally both indexes are reset to ``0..n-1``.
    """
    sha1sum_id = "2aae657b786f505004ac2922b66097d60a540a58"
    dataset = "hateful_memes"
    file_name = f"{dataset}_for_unit_tests.zip"
    url = get_repo_url() + file_name
    save_path = os.path.join(get_data_home_dir(), file_name)
    self._path = os.path.join(get_data_home_dir(), dataset)
    download(
        url=url,
        path=save_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        save_path,
        sha1_hash=sha1sum_id,
        folder=self._path,
    )
    self._train_df = pd.read_csv(os.path.join(self._path, "train.csv"), index_col=0)
    self._test_df = pd.read_csv(os.path.join(self._path, "test.csv"), index_col=0)

    # The base folder is identical for every cell, so build it once instead
    # of once per row inside the lambdas.
    image_dir = os.path.join(self._path, "images")
    for img_col in self.image_columns:
        self._train_df[img_col] = self._train_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        self._test_df[img_col] = self._test_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        # The index still carries the labels read from the CSV here (it is
        # only reset below), so show the first row by position; ``[0]`` would
        # look up the *label* 0 and fail when no such label exists.
        print(self._train_df[img_col].iloc[0])
        print(self._test_df[img_col].iloc[0])

    self._train_df.reset_index(drop=True, inplace=True)
    self._test_df.reset_index(drop=True, inplace=True)
    print(f"train sample num: {len(self._train_df)}")
    print(f"test sample num: {len(self._test_df)}")
def __init__(self):
    """Fetch the ``petfinder`` unit-test archive and load sub-sampled splits.

    The archive URL is ``get_repo_url()`` plus the archive file name; the
    archive is saved under ``get_data_home_dir()`` and extracted into a
    folder named after the dataset. ``train.csv`` and ``test.csv`` from that
    folder are read (first CSV column used as the index) and every column in
    ``self.image_columns`` is passed through ``path_expander`` with the
    extracted ``images`` folder as base folder.

    Both frames are then shrunk with ``train_test_split(test_size=0.1)``,
    stratified on ``self.label_columns[0]``; only the second returned part
    is kept as ``self._train_df`` / ``self._test_df``. Finally both indexes
    are reset to ``0..n-1``.
    """
    sha1sum_id = "72cb19612318bb304d4a169804f525f88dc3f0d0"
    dataset = "petfinder"
    file_name = f"{dataset}_for_unit_tests.zip"
    url = get_repo_url() + file_name
    save_path = os.path.join(get_data_home_dir(), file_name)
    self._path = os.path.join(get_data_home_dir(), dataset)
    download(
        url=url,
        path=save_path,
        sha1_hash=sha1sum_id,
    )
    protected_zip_extraction(
        save_path,
        sha1_hash=sha1sum_id,
        folder=self._path,
    )
    self._train_df = pd.read_csv(os.path.join(self._path, "train.csv"), index_col=0)
    self._test_df = pd.read_csv(os.path.join(self._path, "test.csv"), index_col=0)

    # The base folder is identical for every cell, so build it once instead
    # of once per row inside the lambdas.
    image_dir = os.path.join(self._path, "images")
    for img_col in self.image_columns:
        self._train_df[img_col] = self._train_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        self._test_df[img_col] = self._test_df[img_col].apply(
            lambda ele: path_expander(ele, base_folder=image_dir)
        )
        # The index still carries the labels read from the CSV here (it is
        # only reset below), so show the first row by position; ``[0]`` would
        # look up the *label* 0 and fail when no such label exists.
        print(self._train_df[img_col].iloc[0])
        print(self._test_df[img_col].iloc[0])

    # Keep only the second part returned by the split (sized by
    # ``test_size=0.1``); the first part is discarded. A fresh
    # ``RandomState(123)`` is passed to each call.
    # NOTE(review): presumably sklearn's train_test_split — confirm import.
    _, self._train_df = train_test_split(
        self._train_df,
        test_size=0.1,
        random_state=np.random.RandomState(123),
        stratify=self._train_df[self.label_columns[0]],
    )
    _, self._test_df = train_test_split(
        self._test_df,
        test_size=0.1,
        random_state=np.random.RandomState(123),
        stratify=self._test_df[self.label_columns[0]],
    )
    self._train_df.reset_index(drop=True, inplace=True)
    self._test_df.reset_index(drop=True, inplace=True)
    print(f"train sample num: {len(self._train_df)}")
    print(f"test sample num: {len(self._test_df)}")
def __init__(self):
    """Fetch the cross-lingual Amazon review archive and sample English splits.

    The archive is saved under ``get_data_home_dir()`` and extracted with
    that same directory as the target folder; the TSV files are then read
    from the ``amazon_review_sentiment_cross_lingual`` sub-folder. 1000
    rows of ``en_train.tsv`` and 200 rows of ``en_test.tsv`` are sampled
    with ``random_state=123`` and stored, re-indexed from 0, as
    ``self._train_en_df`` and ``self._test_en_df``.
    """
    checksum = "9c701aa6fc42ec3fe429bfe85a8dac4532ab9fcd"
    dataset = "amazon_review_sentiment_cross_lingual"
    archive_name = f"{dataset}.zip"
    archive_url = get_repo_url() + archive_name
    archive_path = os.path.join(get_data_home_dir(), archive_name)
    self._path = os.path.join(get_data_home_dir(), dataset)

    download(url=archive_url, path=archive_path, sha1_hash=checksum)
    # Extraction targets the data home itself, not self._path.
    protected_zip_extraction(archive_path, sha1_hash=checksum, folder=get_data_home_dir())

    # Both files are header-less, tab-separated (label, text) tables.
    tsv_options = dict(sep="\t", header=None, names=["label", "text"])

    train_full = pd.read_csv(os.path.join(self._path, "en_train.tsv"), **tsv_options)
    self._train_en_df = train_full.sample(1000, random_state=123)

    test_full = pd.read_csv(os.path.join(self._path, "en_test.tsv"), **tsv_options)
    self._test_en_df = test_full.sample(200, random_state=123)

    # Sampling keeps the original row labels; renumber both frames in place.
    for frame in (self._train_en_df, self._test_en_df):
        frame.reset_index(drop=True, inplace=True)

    print(f"train sample num: {len(self._train_en_df)}")
    print(f"test sample num: {len(self._test_en_df)}")
def __init__(self):
    """Fetch the ``ae`` unit-test archive and load its train/test tables.

    The archive is saved under ``get_data_home_dir()`` and extracted into a
    folder named after the dataset. ``train.csv`` and ``test.csv`` from that
    folder are read with the first CSV column as index, stored as
    ``self._train_df`` / ``self._test_df``, and re-indexed from 0.
    """
    checksum = "8c2a25555c49ef2b30545004488022465808d03f"
    dataset = "ae"
    archive_name = f"{dataset}_for_unit_tests.zip"
    archive_url = get_repo_url() + archive_name
    archive_path = os.path.join(get_data_home_dir(), archive_name)
    self._path = os.path.join(get_data_home_dir(), dataset)

    download(url=archive_url, path=archive_path, sha1_hash=checksum)
    protected_zip_extraction(archive_path, sha1_hash=checksum, folder=self._path)

    train_csv = os.path.join(self._path, "train.csv")
    self._train_df = pd.read_csv(train_csv, index_col=0)
    test_csv = os.path.join(self._path, "test.csv")
    self._test_df = pd.read_csv(test_csv, index_col=0)

    # Drop the index read from the CSV and renumber both frames in place.
    for frame in (self._train_df, self._test_df):
        frame.reset_index(drop=True, inplace=True)

    print(f"train sample num: {len(self._train_df)}")
    print(f"test sample num: {len(self._test_df)}")