Example #1
def get_shuffled_second_text(split_df):
    # Takes in a df of first and second texts, shuffles it, shifts the second_text column by one row, and then outputs both texts.
    # This ensures that we get randomly shuffled pairs in which every first_text is paired with a text other than its original second_text.

    # Do the shuffle
    split_df = split_df.sample(frac=1, random_state = get_random_seed())

    # Shift all second texts along one
    new_second_texts = split_df.second_text.shift(1)
    new_second_texts.iloc[0] = split_df.second_text.iloc[-1]

    # Make this shifted series the new second_text (i.e. each row is no longer matched with its original first_text)
    split_df["second_text"] = new_second_texts
    return split_df
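A minimal usage sketch of the shuffle-and-shift trick above, on toy data (the texts and the get_random_seed() stub below are placeholders, not part of the original project):

import pandas as pd

def get_random_seed():
    return 0  # stand-in for the project's seed helper

pairs = pd.DataFrame({
    "first_text": ["q1", "q2", "q3", "q4"],
    "second_text": ["a1", "a2", "a3", "a4"],
})
mismatched = get_shuffled_second_text(pairs)
# Each row now carries a second_text taken from a different original pair.
print(mismatched[["first_text", "second_text"]])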
Example #2
def get_cifar10_for_automl(batch_size=512, num_workers=4, augment=False, random_seed=None):
    '''
    Gets the CIFAR-10 train, validation, and test loaders for the AutoML experiments.
    '''
    data_dir = str(CIFAR_DATA_PATH)
    val_size = 0.25
    if random_seed is None:
        random_seed = get_random_seed()
    trainloader, valloader = get_train_val_loader(data_dir, batch_size, augment, device, random_seed,
                            val_size=val_size, shuffle=True, show_sample=False, num_workers=num_workers)
    batch_size_test = 1024
    testloader = get_test_loader(data_dir, batch_size_test, device, shuffle=True, num_workers=num_workers)
    return trainloader, valloader, testloader
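A minimal usage sketch, assuming the project's helpers (get_train_val_loader, get_test_loader, CIFAR_DATA_PATH, device) are importable; the batch size here is a placeholder:

trainloader, valloader, testloader = get_cifar10_for_automl(batch_size=256)
images, labels = next(iter(trainloader))
print(images.shape, labels.shape)  # e.g. torch.Size([256, 3, 32, 32]) and torch.Size([256])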
Example #3
    def get_embeds(self):
        df = self.get_dataset(dataset_name=self.test_dataset,
                              app_name=self.app_name)

        # Remove English stop words and lowercase the text (the explicit
        # str.lower() below is redundant given lowercase=True, but harmless)
        vectorizer = CountVectorizer(stop_words="english", lowercase=True)
        bow_embed = vectorizer.fit_transform(df.text.str.lower())

        lda = LatentDirichletAllocation(n_components=self.get_embedding_size(),
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=get_random_seed())
        embeddings = lda.fit_transform(bow_embed)

        return embeddings, df
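For reference, a self-contained sketch of the same bag-of-words to LDA pipeline on toy review texts (the component count and seed are placeholders):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

texts = [
    "the app keeps crashing after the update",
    "please add a dark mode option",
    "crashes every time I open the camera",
]
vectorizer = CountVectorizer(stop_words="english", lowercase=True)
bow = vectorizer.fit_transform(texts)
lda = LatentDirichletAllocation(n_components=2, learning_method="online",
                                learning_offset=50., random_state=0)
embeddings = lda.fit_transform(bow)
print(embeddings.shape)  # (3, 2): one topic-distribution vector per document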
Example #4
def get_cifar10_for_data_point_mdl_gen(batch_size=512, num_workers=4, augment=False, random_seed=None):
    '''
    Gets the CIFAR-10 data loaders for training the models that will serve as data points.
    '''
    data_dir = str(CIFAR_DATA_PATH)
    val_size = 0.1
    random_seed = get_random_seed() if random_seed is None else random_seed
    trainloader, valloader = get_train_val_loader(data_dir, batch_size, augment, device, random_seed,
                            val_size=val_size, shuffle=True, show_sample=False, num_workers=num_workers)
    batch_size_test = 1024
    testloader = get_test_loader(data_dir, batch_size_test, device, shuffle=True, num_workers=num_workers)
    return trainloader, valloader, testloader
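Apart from the smaller validation split (val_size = 0.1 here versus 0.25 in Example #2), this function is identical to get_cifar10_for_automl; its loaders feed the training of the models that later serve as data points.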
Example #5
def get_single_dataset(dataset):

    DOWNLOADER_DICT = {
        "guzman_2015": Guzman2015,
        "maalej_2016": Maalej2016,
        "williams_2017": Williams2017,
        "chen_2014": Chen2014,
        "di_sorbo_2016": DiSorbo2016,
        "scalabrino_2017": Scalabrino2017,
        "jha_2017": Jha2017,
        "tizard_2019": Tizard2019,
        "ciurumelea_2017": Ciurumelea2017,
    }

    DOWNLOAD_DIR = os.path.join(".", "data", "raw")
    # makedirs creates the nested ./data/raw path; os.mkdir would fail if ./data did not exist
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    dataset_download_dir = os.path.join(DOWNLOAD_DIR, dataset)
    if not os.path.exists(dataset_download_dir):
        assert dataset in DOWNLOADER_DICT, (
            f"{dataset} is not supported. Please create a folder with the path {dataset_download_dir} "
            "and add your own APP_NAME.csv files with 'text' and 'label' columns to use your own data."
        )
        downloader = DOWNLOADER_DICT[dataset](random_state=get_random_seed(), download_dir=dataset_download_dir)
        print(f"Downloading {dataset}")
        downloader.download()
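A minimal usage sketch, assuming the downloader classes and get_random_seed() from the surrounding project are importable (the dataset key must be one of the names in DOWNLOADER_DICT):

get_single_dataset("maalej_2016")
# On first call this downloads the Maalej 2016 review data into ./data/raw/maalej_2016;
# subsequent calls find the folder already present and skip the download.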