def join_cadastre():
    """ Join the table basol_cadastre with the cadastre table """

    # Input datasets
    basol_cadastre = Dataset("etl", "basol_cadastre")
    cadastre = Dataset("etl", "cadastre")

    # Output dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    basol_cadastre_dtype = basol_cadastre.read_dtype()
    dtype = [*basol_cadastre_dtype, Column("geog", Geometry(srid=4326))]
    basol_cadastre_joined.write_dtype(dtype)

    BasolCadastre = basol_cadastre.reflect()
    Cadastre = cadastre.reflect()

    session = basol_cadastre.get_session()

    cond = (BasolCadastre.commune == Cadastre.commune) & \
           (BasolCadastre.section == Cadastre.section) & \
           (BasolCadastre.numero == Cadastre.numero)

    q = session.query(BasolCadastre, Cadastre.geog) \
               .join(Cadastre, cond) \
               .yield_per(500)

    with basol_cadastre_joined.get_writer() as writer:
        for (basol, geog) in q:
            output_row = {**row2dict(basol), "geog": geog}
            output_row.pop("id")
            writer.write_row_dict(output_row)

    session.close()
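# Several recipes in this file call a row2dict helper. Below is a minimal
# sketch of what it might look like, assuming rows are SQLAlchemy mapped
# instances; the project's actual implementation may differ.
def row2dict(row):
    """ Convert a SQLAlchemy mapped instance into a plain dict """
    return {c.name: getattr(row, c.name) for c in row.__table__.columns}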
def merge_cadastre():
    """ Merge the different parcels into a MultiPolygon """

    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ]
    basol_cadastre_merged.write_dtype(dtype)

    BasolCadastreJoined = basol_cadastre_joined.reflect()

    session = basol_cadastre_joined.get_session()

    select = [
        BasolCadastreJoined.numerobasol,
        func.st_multi(func.st_union(BasolCadastreJoined.geog))
    ]

    q = session.query(*select) \
               .group_by(BasolCadastreJoined.numerobasol) \
               .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in q:
            row = {"numerobasol": numerobasol, "geog": geog}
            writer.write_row_dict(row)

    session.close()
def add_parcels():
    """ Join table basol_intersected with basol_cadastre_merged """

    # Input datasets
    basol_intersected = Dataset("etl", "basol_intersected")
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    # Output dataset
    basol_with_parcels = Dataset("etl", "basol_with_parcels")

    BasolIntersected = basol_intersected.reflect()
    BasolCadastreMerged = basol_cadastre_merged.reflect()

    dtype = basol_intersected.read_dtype()
    basol_with_parcels.write_dtype(dtype)

    session = basol_intersected.get_session()

    cond = (BasolIntersected.numerobasol == BasolCadastreMerged.numerobasol)

    q = session.query(BasolIntersected, BasolCadastreMerged.geog) \
               .join(BasolCadastreMerged, cond, isouter=True) \
               .all()

    with basol_with_parcels.get_writer() as writer:
        for (row, geog) in q:
            if geog is not None:
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
                row.geog = geog
            writer.write_row_dict(row2dict(row))

    session.close()
def parse_cadastre():
    """ Parse the cadastre_multi field of basol_source into individual
    parcels (one row per commune, section, numero) """

    # Input dataset
    basol_source = Dataset("etl", "basol_source")

    # Output dataset
    basol_cadastre = Dataset("etl", "basol_cadastre")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ]
    basol_cadastre.write_dtype(dtype)

    with basol_cadastre.get_writer() as writer:
        for row in basol_source.iter_rows(primary_key="numerobasol"):
            cadastre_multi = row["cadastre_multi"]
            if cadastre_multi:
                parcelles = transformers.parse_cadastre(cadastre_multi)
                for parcelle in parcelles:
                    output_row = {
                        "numerobasol": row["numerobasol"],
                        **parcelle
                    }
                    writer.write_row_dict(output_row)
def merge_geog():
    """
    Choose the best precision between the initial coordinates and the
    geocoded coordinates, for records whose geog is not set from
    cadastre information
    """

    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")

    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()

    session = basol_geocoded.get_session()

    point_lambert2 = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(
                BasolGeocoded.coordxlambertii,
                BasolGeocoded.coordylambertii),
            LAMBERT2),
        WGS84)

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(
            BasolGeocoded.geocoded_longitude,
            BasolGeocoded.geocoded_latitude),
        WGS84)

    q = session.query(BasolGeocoded, point_lambert2, point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:
        for (row, point_lambert2, point_geocoded) in q:
            output_row = {
                **row2dict(row),
                "geog": None,
                "geog_precision": None,
                "geog_source": None
            }
            if row.l2e_precision == precisions.HOUSENUMBER:
                output_row["geog"] = point_lambert2
                output_row["geog_precision"] = row.l2e_precision
                output_row["geog_source"] = "lambert2"
            elif (row.geocoded_result_type == precisions.HOUSENUMBER) and \
                 (row.geocoded_result_score >= 0.6):
                output_row["geog"] = point_geocoded
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"
            writer.write_row_dict(output_row)

    session.close()
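# For readers who want to sanity-check the Lambert II -> WGS84 conversion
# performed in PostGIS above, the same transform can be done client-side
# with pyproj. This assumes the LAMBERT2 constant is EPSG:27572 (Lambert
# zone II étendu), which is an assumption, not confirmed by the code here.
from pyproj import Transformer

transformer = Transformer.from_crs("EPSG:27572", "EPSG:4326", always_xy=True)
lon, lat = transformer.transform(600000.0, 2420000.0)
print(lon, lat)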
def prepare_code_postal():
    code_postal_source = Dataset("etl", "code_postal_source")
    code_postal = Dataset("etl", "code_postal")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("nom_commune", String),
        Column("version", Integer)
    ]
    code_postal.write_dtype(dtype)

    with code_postal.get_writer() as writer:
        for row in code_postal_source.iter_rows(
                primary_key="Code_commune_INSEE"):
            output_row = {
                "code_insee": row["Code_commune_INSEE"],
                "code_postal": row["Code_postal"],
                "nom_commune": row["Nom_commune"]
            }
            writer.write_row_dict(output_row)
def load_bottleneck_data(training_file, validation_file, breadth):
    """
    Utility function to load bottleneck features.

    Arguments:
        training_file - String, path to the pickled training features
        validation_file - String, path to the pickled validation features
        breadth - Int, output breadth (number of label classes)
    """
    print("Training file", training_file)
    print("Validation file", validation_file)
    print("Output breadth", breadth)

    with open(training_file, 'rb') as f:
        train_data = pickle.load(f)
    with open(validation_file, 'rb') as f:
        validation_data = pickle.load(f)

    X_train = train_data['features']
    y_train = train_data['labels']
    X_val = validation_data['features']
    y_val = validation_data['labels']

    D_train = Dataset('Training', Data(X_train),
                      Likelihoods(y_train, breadth))
    D_val = Dataset('Validation', Data(X_val),
                    Likelihoods(y_val, breadth))

    return (D_train, D_val)
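# Hypothetical invocation of the loader above; the pickle file names are
# placeholders, not paths from this project.
D_train, D_val = load_bottleneck_data(
    "bottleneck_features_train.p",
    "bottleneck_features_validation.p",
    breadth=10)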
def geocode():
    """ Geocode Basol addresses """

    # Input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # Output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # Write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")
    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]
    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:
        for df in basol_filtered.get_dataframes(chunksize=100):
            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]
            geocoded = bulk_geocode(
                payload,
                columns=["adresse"],
                citycode="code_insee")
            zipped = list(zip(rows, geocoded))
            for (row, geocodage) in zipped:
                latitude = geocodage["latitude"]
                row["geocoded_latitude"] = float(latitude) \
                    if latitude else None
                longitude = geocodage["longitude"]
                row["geocoded_longitude"] = float(longitude) \
                    if longitude else None
                result_score = geocodage["result_score"]
                row["geocoded_result_score"] = float(result_score) \
                    if result_score else None
                row["geocoded_result_type"] = geocodage["result_type"]
                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                        row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None
                writer.write_row_dict(row)
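# A hedged sketch of what bulk_geocode might do, assuming it wraps the
# French BAN bulk CSV endpoint (https://api-adresse.data.gouv.fr). This is
# an illustration of the technique, not the project's implementation.
import csv
import io

import requests


def bulk_geocode(rows, columns, citycode):
    # Serialize the payload to CSV in memory
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    writer.writerows(rows)
    response = requests.post(
        "https://api-adresse.data.gouv.fr/search/csv/",
        files={"data": ("payload.csv", buf.getvalue())},
        data={"columns": columns, "citycode": citycode})
    response.raise_for_status()
    # The service echoes the input columns and appends result_* columns
    reader = csv.DictReader(io.StringIO(response.text))
    return [{
        "latitude": r.get("latitude"),
        "longitude": r.get("longitude"),
        "result_score": r.get("result_score"),
        "result_type": r.get("result_type"),
        "result_id": r.get("result_id"),
    } for r in reader]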
def pretrain_RNADE(self):
    print('Pre-training the RNADE')
    l2 = 2.
    rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                  hidden_act=self.hidden_act, l2=l2)
    batch_size = 100
    num_examples = 100
    filename = 'pre_train_params.pickle'
    learning_rate = self.learning_rate_pretrain
    # Build a training set by stacking sampled motion-capture sequences
    train_data = mocap_data.sample_train_seq(batch_size)
    for i in range(1, num_examples):
        train_data = numpy.vstack(
            (train_data, mocap_data.sample_train_seq(batch_size)))
    numpy.random.shuffle(train_data)
    total_num = train_data.shape[0]
    train_frac = 0.8
    train_dataset = Dataset([train_data[0:int(train_frac * total_num)]], 100)
    valid_dataset = Dataset([train_data[int(train_frac * total_num):]], 100)
    optimiser = SGD_Optimiser(rnade.params, [rnade.v],
                              [rnade.cost, rnade.ll_cost, rnade.l2_cost],
                              momentum=True, patience=20,
                              clip_gradients=self.clip_gradients)
    optimiser.train(train_dataset, valid_set=valid_dataset,
                    learning_rate=learning_rate, num_epochs=5, save=True,
                    lr_update=True, update_type='linear', start=2,
                    output_folder=self.output_folder, filename=filename)
    self.plot_costs(optimiser, fig_title='Pretraining cost',
                    filename='pretraining.png')
    # Load the best parameters found during pre-training
    print('Loading best RNADE parameters')
    rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                  hidden_act=self.hidden_act, l2=l2)
    rnade.load_model(self.output_folder, filename=filename)
    # Copy the pre-trained parameters into the full model
    for param in rnade.params:
        value = param.get_value()
        self.model.params_dict[param.name].set_value(value)
    print('Done pre-training.')
    # Save pre-training costs to the results dict
    self.results['pretraining_train_costs'] = optimiser.train_costs
    self.results['pretraining_valid_costs'] = optimiser.valid_costs
def normalize_precision():
    """
    This recipe normalizes the values of the lib_precis column to the
    PARCEL, HOUSENUMBER, MUNICIPALITY nomenclature
    """

    # Input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")

    # Output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    # Mapping from the raw French labels to the normalized nomenclature
    mapping = {
        "Coordonnées précises": precisions.PARCEL,
        "Coordonnée précise": precisions.PARCEL,
        "Valeur Initiale": precisions.PARCEL,
        "Adresse postale": precisions.HOUSENUMBER,
        "Centroïde Commune": precisions.MUNICIPALITY,
        "Inconnu": precisions.MUNICIPALITY
    }

    with s3ic_normalized.get_writer() as writer:
        for row in s3ic_geocoded.iter_rows():
            precision = row.get("precision")
            if precision:
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY
            writer.write_row_dict(row)
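# A plausible sketch of the precisions module referenced throughout these
# recipes, assuming plain string constants matching the BAN geocoder's
# result_type values; the exact values here are an assumption.
PARCEL = "parcel"
HOUSENUMBER = "housenumber"
MUNICIPALITY = "municipality"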
def initialize(self, source, target, batch_size1, batch_size2,
               scale=32, shuffle_=False):
    transform = transforms.Compose([
        transforms.Resize(scale),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    dataset_source = Dataset(source['imgs'], source['labels'],
                             transform=transform)
    dataset_target = Dataset(target['imgs'], target['labels'],
                             transform=transform)
    data_loader_s = torch.utils.data.DataLoader(
        dataset_source,
        batch_size=batch_size1,
        shuffle=shuffle_,
        num_workers=4
    )
    data_loader_t = torch.utils.data.DataLoader(
        dataset_target,
        batch_size=batch_size2,
        shuffle=shuffle_,
        num_workers=4
    )
    self.dataset_s = dataset_source
    self.dataset_t = dataset_target
    self.paired_data = PairedData(data_loader_s, data_loader_t, float("inf"))
def main():
    """ Main process. """
    args = parse_cli_args()
    config = TrainConfig()

    train_ds = Dataset(args.train_path)
    valid_ds = Dataset(args.valid_path)

    model = make_model()
    optimizer = getattr(optim, config.optimizer_name)(model.parameters(),
                                                      lr=config.learning_rate)

    training = Training(
        train_ds,
        valid_ds,
        model,
        optimizer,
        config.batch_size,
        config.epochs,
    )
    training.train()
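# A plausible shape for TrainConfig, inferred from the attributes main()
# reads above; the field names match that usage, but the default values
# are assumptions.
from dataclasses import dataclass


@dataclass
class TrainConfig:
    optimizer_name: str = "Adam"
    learning_rate: float = 1e-3
    batch_size: int = 32
    epochs: int = 10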
def filter_departements():
    """
    Filter the data to keep only records located in the départements
    selected in the config
    """

    # Input dataset
    sis_source = Dataset("etl", "sis_source")

    # Output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            code_insee = row["code_insee"]
            keep_row = False
            for departement in DEPARTEMENTS:
                if code_insee.startswith(departement):
                    keep_row = True
                    break
            if keep_row:
                writer.write_row_dict(row)
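# The DEPARTEMENTS setting used above (and by the Basol filter further
# down) is presumably a list of département code prefixes defined in the
# project config; the codes below are illustrative only.
DEPARTEMENTS = ["07", "26", "38", "69"]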
def prepare_sites():
    """
    This recipe adds a primary key and keeps only selected columns
    """

    # Input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")

    # Output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # Columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # Transform schema
    output_dtype = [column for column in dtype if column.name in keep]
    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)
    output_dtype = [id_column, *output_dtype]
    basias_sites_prepared.write_dtype(output_dtype)

    # Transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            output_row = dict((key, row[key]) for key in row if key in keep)
            writer.write_row_dict(output_row)
def scrap_adresses():
    """
    Scrape the addresses found on the Géorisques detail pages

    Example: from the url
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605
    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, we only scrape addresses for records whose
    precision is "Centroïde Commune"
    """

    # Input dataset
    s3ic_filtered = Dataset("etl", "s3ic_source")

    # Output dataset
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    dtype = s3ic_filtered.read_dtype()
    output_dtype = [*dtype, Column("adresse", String)]
    s3ic_scraped.write_dtype(output_dtype)

    with s3ic_scraped.get_writer() as writer:
        for df in s3ic_filtered.get_dataframes(chunksize=100):
            filtered = df.loc[
                (df["lib_precis"] == "Centroïde Commune") &
                (df["url_fiche"].notnull())].copy()
            urls = filtered["url_fiche"].tolist()
            scrapers = [IcpeScraper(url) for url in urls]
            fetch_parallel(scrapers)
            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()
            filtered["adresse"] = [s.adresse for s in scrapers]

            # Map the scraped addresses back onto the full chunk
            def f(row):
                try:
                    return filtered["adresse"].loc[row.name]
                except KeyError:
                    return None

            df["adresse"] = df.apply(f, axis=1)
            writer.write_dataframe(df)
def add_geog():
    """
    This recipe joins with the cadastre table to clean up invalid
    parcels. We use the cadastre table from the kelrisks schema rather
    than the one from the etl schema, because there is not enough
    storage on the server to hold 4 cadastre tables (preprod `etl`,
    preprod `kelrisks`, prod `etl`, prod `kelrisks`). We therefore have
    to drop the cadastre tables from the `etl` schema after copying them
    into the `kelrisks` schema. The `etl.cadastre` table thus does not
    necessarily exist when this DAG runs.
    """

    # Input datasets
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")
    cadastre = Dataset("kelrisks", "cadastre")

    # Output dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    BasiasCadastreParsed = basias_cadastre_parsed.reflect()
    Cadastre = cadastre.reflect()

    basias_cadastre_with_geog.write_dtype([
        *basias_cadastre_parsed.read_dtype(),
        Column("geog", Geometry(srid=4326))
    ])

    session = basias_cadastre_parsed.get_session()

    cond = (BasiasCadastreParsed.commune == Cadastre.commune) & \
           (BasiasCadastreParsed.section == Cadastre.section) & \
           (BasiasCadastreParsed.numero == Cadastre.numero)

    q = session.query(BasiasCadastreParsed, Cadastre.geog) \
               .join(Cadastre, cond) \
               .yield_per(500)

    with basias_cadastre_with_geog.get_writer() as writer:
        for (row, geog) in q:
            output_row = {**row2dict(row), "geog": None}
            del output_row["id"]
            if geog is not None:
                output_row["geog"] = geog
            writer.write_row_dict(output_row)

    session.close()
def filter_departements():
    basol_source = Dataset("etl", "basol_source")
    basol_filtered = Dataset("etl", "basol_filtered")

    basol_filtered.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        *basol_source.read_dtype(primary_key="numerobasol")
    ])

    with basol_filtered.get_writer() as writer:
        for row in basol_source.iter_rows():
            if row["departement"] in DEPARTEMENTS:
                writer.write_row_dict(row)
def join_sites_localisation():
    """
    Join the sites table with the localisation table
    """

    # Datasets to join
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    # Output dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")

    # Transform types
    output_dtype = [
        *basias_sites_prepared.read_dtype(),
        *basias_localisation_with_cadastre.read_dtype()
    ]
    output_dtype = merge_dtype(output_dtype)
    basias_sites_localisation_joined.write_dtype(output_dtype)

    # Transform data
    BasiasSitesPrepared = basias_sites_prepared.reflect()
    BasiasLocalisation = basias_localisation_with_cadastre.reflect()

    session = basias_sites_prepared.get_session()

    join_query = session \
        .query(BasiasSitesPrepared, BasiasLocalisation) \
        .join(BasiasLocalisation,
              BasiasSitesPrepared.indice_departemental ==
              BasiasLocalisation.indice_departemental,
              isouter=True) \
        .all()

    with basias_sites_localisation_joined.get_writer() as writer:
        for (site, localisation) in join_query:
            output_row = {c.name: None for c in output_dtype}
            output_row = {**output_row, **row2dict(site)}
            if localisation:
                output_row = {**output_row, **row2dict(localisation)}
            del output_row["id"]
            writer.write_row_dict(output_row)

    session.close()
def create_loaders(test_dir, test_list, normalise_params, num_workers):
    # Torch libraries
    from torchvision import transforms
    from torch.utils.data import DataLoader, random_split
    # Custom libraries
    if args.data_name == 'nyu':
        from datasets import NYUDataset as Dataset
    elif args.data_name == 'cityscapes':
        from datasets import CSDataset as Dataset
    from datasets import ToTensor, Normalise

    composed_test = transforms.Compose(
        [Normalise(*normalise_params), ToTensor()])

    ## Test Set ##
    testset = Dataset(data_file=test_list,
                      data_dir=test_dir,
                      transform_trn=None,
                      transform_val=composed_test)
    logger.info(" Created test set with {} examples".format(len(testset)))

    ## Test Loader ##
    test_loader = DataLoader(testset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=num_workers,
                             pin_memory=True)
    return test_loader
def generate_folds_for_dataset():
    dataset_names = (Dataset.get_dataset_names() +
                     Dataset.interesting_2d_datasets())
    for dataset_name in dataset_names:
        dataset = Dataset(dataset_name)
        print("making folds for dataset ", dataset_name)
        os.makedirs(os.path.join(FOLD_PATH, dataset_name), exist_ok=True)
        for run_nb in range(10):
            # toon's code
            # skf = cross_validation.StratifiedKFold(labels, n_folds=5,
            #                                        shuffle=True)
            skf = StratifiedKFold(n_splits=10, shuffle=True)
            # skf = KFold(n_splits=10, shuffle=True)
            labels = dataset.target
            for fold_nb, (train_indices, test_indices) in \
                    enumerate(skf.split(np.zeros(len(labels)), labels)):
                to_write = dict()
                to_write["train_indices"] = train_indices.tolist()
                to_write["test_indices"] = test_indices.tolist()
                fold_file_path = os.path.join(
                    FOLD_PATH, dataset_name,
                    "run{}_fold{}.txt".format(run_nb, fold_nb))
                if os.path.isfile(fold_file_path):
                    print("fold file already exists! not overwriting!")
                    continue
                with open(fold_file_path, mode='w') as fold_file:
                    json.dump(to_write, fold_file)
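# A hypothetical helper for reading back one of the fold files written
# above; load_fold is not part of the original code, just an illustration
# of the file format.
import json
import os


def load_fold(fold_path, dataset_name, run_nb, fold_nb):
    path = os.path.join(fold_path, dataset_name,
                        "run{}_fold{}.txt".format(run_nb, fold_nb))
    with open(path) as fold_file:
        fold = json.load(fold_file)
    return fold["train_indices"], fold["test_indices"]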
def __init__(self):
    # cfg_from_file("./cfgs/shape32.yml")
    cfg_from_file("./cfg/32_chair_table.yml")
    data_path = cfg.DATA_DIR + cfg.DATASET_NAME
    self.dataset = Dataset(data_path, cfg.SPLIT, cfg.CATEGORY, cfg.CHANNEL)
    self.wordtoix = self.dataset.wordtoix

    # Load the pretrained text encoder and freeze its weights
    text_encoder = RNN_ENCODER(self.dataset.n_words,
                               nhidden=cfg.TEXT.EMBEDDING_DIM)
    # state_dict = torch.load("./data/models/text_encoder66.pth",
    #                         map_location=lambda storage, loc: storage)
    state_dict = torch.load("./networks/pretrain/text_encoder.pth",
                            map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    for p in text_encoder.parameters():
        p.requires_grad = False
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder.eval()
    self.text_encoder = text_encoder

    # Load the pretrained generator
    self.netG = G_NET(4)
    # self.netG = torch.nn.DataParallel(netG)
    # state_dict = torch.load('./data/models/netG_epoch_39.pth',
    #                         map_location=lambda storage, loc: storage)
    state_dict = torch.load("./networks/pretrain/netG_epoch.pth",
                            map_location=lambda storage, loc: storage)
    self.netG.load_state_dict(state_dict)
    print('Load G from: ', cfg.TRAIN.NET_G)
    self.netG.eval()

    self.noise = torch.randn(1, 128)
    self.noise.data.normal_(0.5, 0.5)
def run(config):
    print(config)
    checkConfigParams(config)
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])
    dataset = Dataset(**config)

    # Preconditions: the dataset has generated a pool of data,
    # and the folds have been generated
    if config["kfcv"] == 1 and config["kfcv_serial"] == 1:
        for i in range(0, config["folds"]):
            neuralnet = ConvModel(dataset, **config)
            # Validate that the folds work
            # dataset.kfcvPrintFoldInfo()
            # exit()
            gc.collect()
            runNet(neuralnet, config, i)
    # Preconditions: the dataset has generated a pool of data,
    # and the folds have been generated
    elif config["kfcv"] == 1:
        neuralnet = ConvModel(dataset, **config)
        runNet(neuralnet, config)
    # Preconditions: the dataset has generated distinct training
    # and testing sets
    else:
        neuralnet = ConvModel(dataset, **config)
        runNet(neuralnet, config)
def main():
    args = ParseArgs()
    dataset_name = args.dataset_name
    lambda_ = args.lambda_
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainloader, testloader, num_classes = Dataset(dataset_name)
    model = torch.load(args.ckpt).to(device)

    weights = [w for name, w in model.named_parameters() if "weight" in name]
    num_features = sum([w.numel() for w in weights])

    criterion = torch.nn.CrossEntropyLoss()
    F, f, norm_l1_x = compute_F(
        trainloader, model, weights, criterion, lambda_)
    density = sum([torch.sum(w != 0).item() for w in weights]) / num_features
    accuracy = check_accuracy(model, testloader)
    print('F:', F)
    print('f:', f)
    print('density:', density)
    print('validation accuracy:', accuracy)
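# A minimal sketch of a check_accuracy helper consistent with the call
# above; the project's real implementation may differ.
import torch


def check_accuracy(model, loader):
    model.eval()
    device = next(model.parameters()).device
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            predictions = model(inputs).argmax(dim=1)
            correct += (predictions == targets).sum().item()
            total += targets.numel()
    return correct / total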
def dummy_test():
    datasets = [Dataset("iris")]

    # Noise-robust COBRAS queried with noisy answers
    clusterer = COBRAS(correct_noise=True, noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "noise_robust_COBRAS", clusterer,
        ProbabilisticNoisyQuerierBuilder(0.05, 100), datasets)

    # Plain COBRAS queried with noisy answers
    clusterer = COBRAS(correct_noise=False, noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "COBRAS_noise", clusterer,
        ProbabilisticNoisyQuerierBuilder(0.05, 100), datasets)

    # Plain COBRAS queried with noise-free answers
    clusterer = COBRAS(correct_noise=False, noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "COBRAS_no_noise", clusterer,
        ProbabilisticNoisyQuerierBuilder(0, 100), datasets)

    calculate_all_aris()
    calculate_all_average_aris()
    compare_algorithms_and_plot_results_n_times_n_fold(
        "test_cobras",
        ["noise_robust_COBRAS", "COBRAS_no_noise", "COBRAS_noise"])
def PCK_means_noise_vs_no_noise():
    names = [
        "breast-cancer-wisconsin", "column_2C", "dermatology", "ecoli",
        "glass", "hepatitis", "ionosphere", "iris"
    ]
    datasets = [Dataset(name) for name in names]

    clusterer = QueryFirstActiveClusterer(
        MyMPCKMeans(learn_multiple_full_matrices=True))
    test_active_clustering_algorithm_n_times_n_fold(
        "random_MPCK_means_full_no_noise", clusterer,
        RandomQuerierBuilder(None, 100, 0), datasets, nb_cores=4, n=1)
    test_active_clustering_algorithm_n_times_n_fold(
        "random_MPCK_means_full_noise", clusterer,
        RandomQuerierBuilder(None, 100, 0.15), datasets, nb_cores=4, n=1)

    calculate_all_aris()
    calculate_all_average_aris()

    comparison_name = "full MPCK-means noise vs no noise"
    algorithms = [
        "random_MPCK_means_full_no_noise",
        "random_MPCK_means_full_noise"
    ]
    compare_algorithms_and_plot_results_n_times_n_fold(comparison_name,
                                                       algorithms)
def get_dataloader(imsize, batch_size):
    global subdataset_idx
    bshuffle = True
    image_transform = transforms.Compose([
        transforms.Resize(int(imsize * 76 / 64)),
        transforms.RandomCrop(imsize),
        transforms.RandomHorizontalFlip()
    ])
    dataset = Dataset(cfg.DATA_DIR, imsize=imsize, transform=image_transform)
    if cfg.TRAIN.DATASET_SIZE != -1:
        # Sample a fixed random subset once and reuse it across calls
        if subdataset_idx is None:
            subdataset_idx = random.sample(range(0, len(dataset)),
                                           cfg.TRAIN.DATASET_SIZE)
        dataset = torch.utils.data.Subset(dataset, subdataset_idx)
    assert dataset
    print('training dataset size: ', len(dataset))
    num_gpu = len(cfg.GPU_ID.split(','))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size * num_gpu,
                                             drop_last=True,
                                             shuffle=bshuffle,
                                             num_workers=int(cfg.WORKERS))
    return dataloader
def get_dataset(dataset_id):
    dataset = DatasetService.get_dataset_meta(dataset_id)
    resource = None
    incs = None
    defaults = None
    meta = None
    if dataset:
        for res in dataset['resources']:
            if res['format'].lower() == 'csv':
                resource = res
            if res['format'].lower() == 'json' and \
                    res['name'].lower() == 'incsinfo':
                incs = res
            if res['format'].lower() == 'json' and \
                    res['name'].lower() == 'defaultinfo':
                defaults = res
            if res['format'].lower() == 'yml' and \
                    res['name'].lower() == 'metadata':
                meta = res
    if resource:
        table_name = resource['id']
        if incs:
            incs = incs.get('url')
        if defaults:
            defaults = defaults.get('url')
        if meta:
            meta = meta.get('url')
        return Dataset(table_name, dataset, incs, defaults, meta)
    else:
        raise toolkit.ObjectNotFound(
            "There's no resource for the given dataset")
def run(self):
    algorithm = algorithm_info_to_object(self.algorithm_name,
                                         self.algorithm_parameters)
    querier_builder = querier_info_to_object(self.querier_name,
                                             self.querier_parameters)
    dataset = Dataset(self.dataset_name)
    train_indices = fold_path_to_train_indices(self.fold_path)
    querier = querier_builder.build_querier(dataset)
    result = None
    # COSC does not always produce a result and can end with an exception,
    # so catch it and skip this run instead of crashing
    try:
        result = algorithm.fit(dataset.data, dataset.number_of_classes(),
                               train_indices, querier)
    except Exception:
        print("An exception occurred during calculation of {} "
              "(this is silently ignored):".format(self.result_path),
              file=sys.stderr)
        traceback.print_exc()
    if result is None:
        return

    # None is not json serializable so use the string "None" instead
    train_indices = train_indices if train_indices is not None else "None"
    full_result = result + (train_indices,)
    os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    with open(self.result_path, mode="w") as result_file:
        json.dump(full_result, result_file)
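# A minimal sketch of fold_path_to_train_indices, consistent with the fold
# files produced by generate_folds_for_dataset above; illustrative only.
import json


def fold_path_to_train_indices(fold_path):
    if fold_path is None:
        return None
    with open(fold_path) as fold_file:
        return json.load(fold_file)["train_indices"]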
def run(self):
    dataset = Dataset(self.dataset_name)
    target = dataset.target
    directory = os.path.join(COP_RF_LIKE_DIR, self.test_name, 'clusterings',
                             self.dataset_name)
    result_aris = dict()
    for run_name in os.listdir(directory):
        # File names look like "constraints{},noise{}.txt"
        constraints, noise = run_name[:-4].split(",")
        # con_per = float(constraints[11:])
        noise_per = float(noise[5:])
        full_path = os.path.join(directory, run_name)
        with open(full_path, mode='r') as clustering_file:
            clusterings, runtime, ml, cl, train_indices = json.load(
                clustering_file)
        last_clustering = clusterings[-1]
        if train_indices == "None":
            ari = get_ARI(last_clustering, target)
        else:
            ari = get_ARI(last_clustering, target,
                          train_indices=train_indices)
        result_aris[noise_per] = ari
    ari_list_in_order = [v for k, v in sorted(result_aris.items())]
    result_file = os.path.join(COP_RF_LIKE_DIR, self.test_name, "aris",
                               self.dataset_name + "_aris.txt")
    os.makedirs(os.path.dirname(result_file), exist_ok=True)
    with open(result_file, mode="w") as result:
        json.dump(ari_list_in_order, result)
def matlab_test():
    dataset = Dataset("iris")
    clusterer = MyCOSCMatlab()
    clusterer.signal_start(dataset.data)
    result = clusterer.fit(
        dataset.data,
        [(1, 2), (2, 3), (3, dataset.number_of_instances())],
        [(10, 12), (23, 16)],
        dataset.number_of_classes())
    print(result)
    clusterer.signal_end()