def get_shuffled_second_text(split_df):
    # Takes in a df of first and second texts, shuffles it, shifts the second
    # texts along by one and then outputs both texts.
    # This ensures that we have randomly shuffled pairs of texts that are
    # always paired with a different other text.
    # Do the shuffle
    split_df = split_df.sample(frac=1, random_state=get_random_seed())
    # Shift all second texts along by one, wrapping the last one around to the front
    new_second_texts = split_df.second_text.shift(1)
    new_second_texts.iloc[0] = split_df.second_text.iloc[-1]
    # Make this shifted second_text the new second_text
    # (i.e. it is no longer matched with its original first_text)
    split_df["second_text"] = new_second_texts
    return split_df
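# A minimal usage sketch (not from the original repo): assumes a pandas
# DataFrame with "first_text" and "second_text" columns and the module's
# get_random_seed() helper. Because the wrap-around shift is a cyclic rotation
# of the shuffled order, every row (for len(df) >= 2) ends up with a
# second_text taken from a different row than its own.
def _demo_get_shuffled_second_text():
    import pandas as pd
    toy = pd.DataFrame({
        "first_text": ["q1", "q2", "q3", "q4"],
        "second_text": ["a1", "a2", "a3", "a4"],
    })
    mismatched = get_shuffled_second_text(toy).sort_index()
    # No first_text keeps its original partner
    assert not (mismatched["second_text"] == toy["second_text"]).any()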
def get_cifar10_for_automl(batch_size=512, num_workers=4, augment=False, random_seed=None):
    ''' Gets the CIFAR-10 train, validation and test loaders for the AutoML experiments. '''
    data_dir = str(CIFAR_DATA_PATH)
    # Train/validation split
    val_size = 0.25
    if random_seed is None:
        random_seed = get_random_seed()
    trainloader, valloader = get_train_val_loader(data_dir, batch_size, augment, device, random_seed,
                                                  val_size=val_size, shuffle=True, show_sample=False,
                                                  num_workers=num_workers)
    # Test set
    batch_size_test = 1024
    testloader = get_test_loader(data_dir, batch_size_test, device, shuffle=True, num_workers=num_workers)
    return trainloader, valloader, testloader
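# The helper below is a hypothetical sketch (not the project's actual
# get_train_val_loader) of how such a train/validation split is commonly
# built with torchvision: shuffle the CIFAR-10 train indices with the given
# seed, carve off val_size of them, and back two DataLoaders with
# SubsetRandomSamplers over the two index sets.
def _sketch_train_val_loader(data_dir, batch_size, random_seed, val_size=0.25, num_workers=4):
    import numpy as np
    from torch.utils.data import DataLoader, SubsetRandomSampler
    from torchvision import datasets, transforms
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
    ])
    train_set = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    # Seeded shuffle of the indices, then split off the validation portion
    indices = np.arange(len(train_set))
    np.random.default_rng(random_seed).shuffle(indices)
    split = int(val_size * len(train_set))
    val_idx, train_idx = indices[:split], indices[split:]
    trainloader = DataLoader(train_set, batch_size=batch_size,
                             sampler=SubsetRandomSampler(train_idx), num_workers=num_workers)
    valloader = DataLoader(train_set, batch_size=batch_size,
                           sampler=SubsetRandomSampler(val_idx), num_workers=num_workers)
    return trainloader, valloader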
def get_embeds(self):
    df = self.get_dataset(dataset_name=self.test_dataset, app_name=self.app_name)
    # Build bag-of-words counts; the vectorizer removes English stopwords and
    # lowercases (lowercase=True), so no separate .str.lower() pass is needed
    vectorizer = CountVectorizer(stop_words="english", lowercase=True)
    bow_embed = vectorizer.fit_transform(df.text)
    # Fit LDA on the counts; each document's topic distribution is its embedding
    lda = LatentDirichletAllocation(n_components=self.get_embedding_size(),
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=get_random_seed())
    embeddings = lda.fit_transform(bow_embed)
    return embeddings, df
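# Self-contained toy illustration (assumed data, not from the repo) of the
# same bag-of-words + LDA embedding pipeline: each document is reduced to its
# topic distribution, giving a dense (n_documents, n_topics) matrix whose rows
# sum to 1.
def _demo_lda_embeds():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    texts = ["the app crashes on startup",
             "great app love the design",
             "please add dark mode",
             "crashes again after the latest update"]
    bow = CountVectorizer(stop_words="english", lowercase=True).fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=2, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=42)
    embeddings = lda.fit_transform(bow)  # shape: (4, 2)
    return embeddings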
def get_cifar10_for_data_point_mdl_gen(batch_size=512, num_workers=4, augment=False, random_seed=None):
    ''' Gets the data sets for training the models that will be data points '''
    data_dir = str(CIFAR_DATA_PATH)
    # Train/validation split
    val_size = 0.1
    random_seed = get_random_seed() if random_seed is None else random_seed
    trainloader, valloader = get_train_val_loader(data_dir, batch_size, augment, device, random_seed,
                                                  val_size=val_size, shuffle=True, show_sample=False,
                                                  num_workers=num_workers)
    # Test set
    batch_size_test = 1024
    testloader = get_test_loader(data_dir, batch_size_test, device, shuffle=True, num_workers=num_workers)
    return trainloader, valloader, testloader
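# Hedged usage sketch (assumed call site, not from the repo): pull one batch
# from the train loader to sanity-check shapes before training the data-point
# models.
def _smoke_test_cifar10_loaders():
    trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen(
        batch_size=256, augment=True, random_seed=0)
    images, labels = next(iter(trainloader))
    # CIFAR-10 batches: images [256, 3, 32, 32], labels [256]
    print(images.shape, labels.shape, len(valloader), len(testloader))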
def get_single_dataset(dataset):
    DOWNLOADER_DICT = {
        "guzman_2015": Guzman2015,
        "maalej_2016": Maalej2016,
        "williams_2017": Williams2017,
        "chen_2014": Chen2014,
        "di_sorbo_2016": DiSorbo2016,
        "scalabrino_2017": Scalabrino2017,
        "jha_2017": Jha2017,
        "tizard_2019": Tizard2019,
        "ciurumelea_2017": Ciurumelea2017,
    }
    DOWNLOAD_DIR = os.path.join(".", "data", "raw")
    # makedirs (rather than mkdir) so the intermediate ./data directory is
    # created as well if it does not exist yet
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    dataset_download_dir = os.path.join(DOWNLOAD_DIR, dataset)
    if not os.path.exists(dataset_download_dir):
        assert dataset in DOWNLOADER_DICT, (
            f"{dataset} is not supported. Please create a folder with the path "
            f"{dataset_download_dir} and add your own APP_NAME.csv files with "
            f"'text' and 'label' columns to use your own data."
        )
        downloader = DOWNLOADER_DICT[dataset](random_state=get_random_seed(),
                                              download_dir=dataset_download_dir)
        print(f"Downloading {dataset}")
        downloader.download()
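# Hypothetical usage (not from the repo): fetch one of the supported datasets
# into ./data/raw/<dataset>/. Per the assertion message above, a custom
# dataset can instead be dropped into that folder as APP_NAME.csv files with
# 'text' and 'label' columns.
def _demo_get_single_dataset():
    get_single_dataset("maalej_2016")
    # ./data/raw/maalej_2016/ should now contain the downloaded csv files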