Example #1
def join_cadastre():
    """ Join the table basol_cadastre with cadastre table """

    # Input datasets
    basol_cadastre = Dataset("etl", "basol_cadastre")
    cadastre = Dataset("etl", "cadastre")

    # Output datasets
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    basol_cadastre_dtype = basol_cadastre.read_dtype()
    dtype = [*basol_cadastre_dtype, Column("geog", Geometry(srid=4326))]
    basol_cadastre_joined.write_dtype(dtype)

    BasolCadastre = basol_cadastre.reflect()
    Cadastre = cadastre.reflect()

    session = basol_cadastre.get_session()

    cond = (BasolCadastre.commune == Cadastre.commune) & \
           (BasolCadastre.section == Cadastre.section) & \
           (BasolCadastre.numero == Cadastre.numero)

    q = session.query(BasolCadastre, Cadastre.geog) \
        .join(Cadastre, cond) \
        .yield_per(500)

    with basol_cadastre_joined.get_writer() as writer:

        for (basol, geog) in q:
            output_row = {**row2dict(basol), "geog": geog}
            output_row.pop("id")
            writer.write_row_dict(output_row)

    session.close()
Example #2
def merge_cadastre():
    """ Merge the different parcelles into a MultiPolygon """

    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ]
    basol_cadastre_merged.write_dtype(dtype)

    BasolCadastreJoined = basol_cadastre_joined.reflect()

    session = basol_cadastre_joined.get_session()

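    # Aggregate the parcel geometries of each numerobasol into a MultiPolygon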
    select = [
        BasolCadastreJoined.numerobasol,
        func.st_multi(func.st_union(BasolCadastreJoined.geog))
    ]

    q = session.query(*select) \
               .group_by(BasolCadastreJoined.numerobasol) \
               .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in q:
            row = {"numerobasol": numerobasol, "geog": geog}
            writer.write_row_dict(row)

    session.close()
Example #3
def add_parcels():
    """ join table basol_intersected with basol_cadastre_merged """

    # input datasets
    basol_intersected = Dataset("etl", "basol_intersected")
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    # output datasets
    basol_with_parcels = Dataset("etl", "basol_with_parcels")

    BasolIntersected = basol_intersected.reflect()
    BasolCadastreMerged = basol_cadastre_merged.reflect()

    dtype = basol_intersected.read_dtype()
    basol_with_parcels.write_dtype(dtype)

    session = basol_intersected.get_session()

    cond = (BasolIntersected.numerobasol == BasolCadastreMerged.numerobasol)
    q = session.query(BasolIntersected, BasolCadastreMerged.geog) \
               .join(BasolCadastreMerged, cond, isouter=True) \
               .all()

    with basol_with_parcels.get_writer() as writer:
        for (row, geog) in q:
            if geog is not None:
                row.geog_precision = precisions.PARCEL
                row.geog_source = "cadastre"
                row.geog = geog
            writer.write_row_dict(row2dict(row))

    session.close()
Example #4
def parse_cadastre():

    basol_source = Dataset("etl", "basol_source")
    basol_cadastre = Dataset("etl", "basol_cadastre")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("commune", String),
        Column("section", String),
        Column("numero", String)
    ]

    basol_cadastre.write_dtype(dtype)

    with basol_cadastre.get_writer() as writer:
        for row in basol_source.iter_rows(primary_key="numerobasol"):
            cadastre_multi = row["cadastre_multi"]
            if cadastre_multi:
                parcelles = transformers.parse_cadastre(cadastre_multi)
                for parcelle in parcelles:
                    output_row = {
                        "numerobasol": row["numerobasol"],
                        **parcelle
                    }
                    writer.write_row_dict(output_row)
Example #5
def merge_geog():
    """
    Choose the best precision between the initial coordinates
    and the geocoded coordinates when geog has not been set
    from cadastre information
    """

    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")

    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()

    session = basol_geocoded.get_session()

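    # Build a point from the Lambert II coordinates and reproject it to WGS84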
    point_lambert2 = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(BasolGeocoded.coordxlambertii,
                              BasolGeocoded.coordylambertii), LAMBERT2), WGS84)

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(BasolGeocoded.geocoded_longitude,
                          BasolGeocoded.geocoded_latitude), WGS84)

    q = session.query(BasolGeocoded, point_lambert2, point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:

        for (row, point_lambert2, point_geocoded) in q:

            output_row = {
                **row2dict(row), "geog": None,
                "geog_precision": None,
                "geog_source": None
            }

            if row.l2e_precision == precisions.HOUSENUMBER:

                output_row["geog"] = point_lambert2
                output_row["geog_precision"] = row.l2e_precision
                output_row["geog_source"] = "lambert2"

            elif (row.geocoded_result_type == precisions.HOUSENUMBER) and \
                 (row.geocoded_result_score >= 0.6):
                output_row["geog"] = point_geocoded
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"

            writer.write_row_dict(output_row)

    session.close()
Example #6
def prepare_code_postal():

    code_postal_source = Dataset("etl", "code_postal_source")
    code_postal = Dataset("etl", "code_postal")

    dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        Column("code_insee", String),
        Column("code_postal", String),
        Column("nom_commune", String),
        Column("version", Integer)
    ]

    code_postal.write_dtype(dtype)

    with code_postal.get_writer() as writer:

        for row in code_postal_source.iter_rows(
                primary_key="Code_commune_INSEE"):

            output_row = {
                "code_insee": row["Code_commune_INSEE"],
                "code_postal": row["Code_postal"],
                "nom_commune": row["Nom_commune"]
            }

            writer.write_row_dict(output_row)
Example #7
def load_bottleneck_data(training_file, validation_file, breadth):
    """
    Utility function to load bottleneck features.

    Arguments:
        training_file - String
        validation_file - String
        breadth - int, output breadth passed to Likelihoods
    """
    print("Training file", training_file)
    print("Validation file", validation_file)
    print("Output breadth", breadth)

    with open(training_file, 'rb') as f:
        train_data = pickle.load(f)
    with open(validation_file, 'rb') as f:
        validation_data = pickle.load(f)

    X_train = train_data['features']
    y_train = train_data['labels']
    X_val = validation_data['features']
    y_val = validation_data['labels']

    D_train = Dataset('Training', Data(X_train), Likelihoods(y_train, breadth))
    D_val = Dataset('Validation', Data(X_val), Likelihoods(y_val, breadth))

    return (D_train, D_val)
Example #8
def geocode():
    """ Geocode Basol adresses """

    # input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")

    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]

    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:

        for df in basol_filtered.get_dataframes(chunksize=100):

            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]

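            # Geocode the chunk of addresses, constrained by the INSEE city code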
            geocoded = bulk_geocode(payload,
                                    columns=["adresse"],
                                    citycode="code_insee")

            zipped = list(zip(rows, geocoded))

            for (row, geocodage) in zipped:
                latitude = geocodage["latitude"]
                row["geocoded_latitude"] = float(latitude) \
                    if latitude else None
                longitude = geocodage["longitude"]
                row["geocoded_longitude"] = float(longitude) \
                    if longitude else None
                result_score = geocodage["result_score"]
                row["geocoded_result_score"] = float(result_score) \
                    if result_score else None
                row["geocoded_result_type"] = geocodage["result_type"]

                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                   row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None

                writer.write_row_dict(row)
Example #9
    def pretrain_RNADE(self):
        print('Pre-training the RNADE')
        l2 = 2.
        rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                      hidden_act=self.hidden_act, l2=l2)
        batch_size = 100
        num_examples = 100
        filename = 'pre_train_params.pickle'
        learning_rate = self.learning_rate_pretrain
        # Build a training matrix by stacking sampled motion-capture sequences
        train_data = mocap_data.sample_train_seq(batch_size)
        for i in range(1, num_examples):
            train_data = numpy.vstack(
                (train_data, mocap_data.sample_train_seq(batch_size)))
        numpy.random.shuffle(train_data)
        total_num = train_data.shape[0]
        train_frac = 0.8
        train_dataset = Dataset([train_data[0:int(train_frac * total_num)]], 100)
        valid_dataset = Dataset([train_data[int(train_frac * total_num):]], 100)
        optimiser = SGD_Optimiser(rnade.params, [rnade.v],
                                  [rnade.cost, rnade.ll_cost, rnade.l2_cost],
                                  momentum=True, patience=20,
                                  clip_gradients=self.clip_gradients)
        optimiser.train(train_dataset, valid_set=valid_dataset,
                        learning_rate=learning_rate, num_epochs=5, save=True,
                        lr_update=True, update_type='linear', start=2,
                        output_folder=self.output_folder, filename=filename)
        self.plot_costs(optimiser, fig_title='Pretraining cost',
                        filename='pretraining.png')
        print('Done pre-training.')
        # Load the best parameters found during pre-training
        print('Loading best RNADE parameters')
        rnade = RNADE(self.n_visible, self.n_hidden, self.n_components,
                      hidden_act=self.hidden_act, l2=l2)
        rnade.load_model(self.output_folder, filename=filename)
        # Copy the pre-trained parameters into the main model
        for param in rnade.params:
            value = param.get_value()
            self.model.params_dict[param.name].set_value(value)
        print('Done pre-training.')
        # Save the pre-training costs to the results dict
        self.results['pretraining_train_costs'] = optimiser.train_costs
        self.results['pretraining_valid_costs'] = optimiser.valid_costs
Example #10
def normalize_precision():
    """
    This recipe normalizes the values of the
    lib_precis column into the nomenclature
    PARCEL, HOUSENUMBER, MUNICIPALITY
    """

    # input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")

    # output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    with s3ic_normalized.get_writer() as writer:

        for row in s3ic_geocoded.iter_rows():

            mapping = {
                "Coordonnées précises": precisions.PARCEL,
                "Coordonnée précise": precisions.PARCEL,
                "Valeur Initiale": precisions.PARCEL,
                "Adresse postale": precisions.HOUSENUMBER,
                "Centroïde Commune": precisions.MUNICIPALITY,
                "Inconnu": precisions.MUNICIPALITY
            }
            precision = row.get("precision")
            if precision:
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY

            writer.write_row_dict(row)
Example #11
    def initialize(self, source, target, batch_size1, batch_size2, scale=32, shuffle_=False):
        transform = transforms.Compose([
                transforms.Resize(scale),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

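        # Wrap the source- and target-domain samples with the shared transform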
        dataset_source = Dataset(source['imgs'], source['labels'], transform=transform)
        dataset_target = Dataset(target['imgs'], target['labels'], transform=transform)

        data_loader_s = torch.utils.data.DataLoader(
            dataset_source,
            batch_size=batch_size1,
            shuffle=shuffle_,
            num_workers=4
        )

        data_loader_t = torch.utils.data.DataLoader(
            dataset_target,
            batch_size=batch_size2,
            shuffle=shuffle_,
            num_workers=4
        )

        self.dataset_s = dataset_source
        self.dataset_t = dataset_target
        self.paired_data = PairedData(data_loader_s, data_loader_t, float("inf"))
Example #12
def main():
    """
    Main process.

    """
    args = parse_cli_args()
    config = TrainConfig()

    train_ds = Dataset(args.train_path)
    valid_ds = Dataset(args.valid_path)

    model = make_model()
    optimizer = getattr(optim, config.optimizer_name)(model.parameters(),
                                                      lr=config.learning_rate)

    training = Training(
        train_ds,
        valid_ds,
        model,
        optimizer,
        config.batch_size,
        config.epochs,
    )

    training.train()
Example #13
def filter_departements():
    """
    Filter the data to keep only the records
    located in the departements selected
    in the config
    """

    # Input dataset
    sis_source = Dataset("etl", "sis_source")

    # output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            code_insee = row["code_insee"]
            keep_row = False
            for departement in DEPARTEMENTS:
                if code_insee.startswith(departement):
                    keep_row = True
                    break
            if keep_row:
                writer.write_row_dict(row)
Example #14
def prepare_sites():
    """
    This recipe adds a primary key
    and keeps only certain columns
    """

    # input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")

    # output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # transform schema
    output_dtype = [column for column in dtype if column.name in keep]

    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)

    output_dtype = [id_column, *output_dtype]

    basias_sites_prepared.write_dtype(output_dtype)

    # transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            output_row = dict((key, row[key]) for key in row if key in keep)
            writer.write_row_dict(output_row)
Example #15
def scrap_adresses():
    """
    Scrape the addresses found on the Géorisques detail pages

    Example:

    From the URL
    http://www.installationsclassees.developpement-durable.gouv.fr
    /ficheEtablissement.php?champEtablBase=61&champEtablNumero=14605

    we extract => Lieu dit 'Les Murettes' 26300 BEAUREGARD BARET

    For performance reasons, addresses are scraped only for records
    whose precision is "Centroïde Commune"
    """

    # input dataset
    s3ic_filtered = Dataset("etl", "s3ic_source")

    # output dataset
    s3ic_scraped = Dataset("etl", "s3ic_scraped")

    dtype = s3ic_filtered.read_dtype()

    output_dtype = [*dtype, Column("adresse", String)]

    s3ic_scraped.write_dtype(output_dtype)

    with s3ic_scraped.get_writer() as writer:

        for df in s3ic_filtered.get_dataframes(chunksize=100):

            filtered = df.loc[(df["lib_precis"] == "Centroïde Commune")
                              & (df["url_fiche"].notnull())].copy()

            urls = filtered["url_fiche"].tolist()
            scrapers = [IcpeScraper(url) for url in urls]
            fetch_parallel(scrapers)

            for scraper in scrapers:
                scraper.parse()
                scraper.find_adresse()

            filtered["adresse"] = [s.adresse for s in scrapers]

            def f(row):
                try:
                    return filtered["adresse"].loc[row.name]
                except KeyError:
                    return None

            df["adresse"] = df.apply(lambda row: f(row), axis=1)

            writer.write_dataframe(df)
Example #16
def add_geog():
    """
    This recipe performs a join with the cadastre table
    to clean up invalid parcels.

    We use the cadastre table from the kelrisks schema
    rather than the one from the etl schema, because the server
    does not have enough storage for four cadastre tables
    (preprod `etl`, preprod `kelrisks`, prod `etl`, prod `kelrisks`).
    We therefore have to drop the cadastre tables from the `etl`
    schema after copying them into the `kelrisks` schema.
    The `etl.cadastre` table thus does not necessarily exist
    when this DAG is executed.
    """

    # Input dataset
    basias_cadastre_parsed = Dataset("etl", "basias_cadastre_parsed")
    cadastre = Dataset("kelrisks", "cadastre")

    # Output dataset
    basias_cadastre_with_geog = Dataset("etl", "basias_cadastre_with_geog")

    BasiasCadastreParsed = basias_cadastre_parsed.reflect()
    Cadastre = cadastre.reflect()

    basias_cadastre_with_geog.write_dtype([
        *basias_cadastre_parsed.read_dtype(),
        Column("geog", Geometry(srid=4326))
    ])

    session = basias_cadastre_parsed.get_session()

    cond = (BasiasCadastreParsed.commune == Cadastre.commune) & \
           (BasiasCadastreParsed.section == Cadastre.section) & \
           (BasiasCadastreParsed.numero == Cadastre.numero)

    q = session.query(BasiasCadastreParsed, Cadastre.geog) \
        .join(Cadastre, cond) \
        .yield_per(500)

    with basias_cadastre_with_geog.get_writer() as writer:

        for (row, geog) in q:
            output_row = {**row2dict(row), "geog": None}
            del output_row["id"]
            if geog is not None:
                output_row["geog"] = geog

            writer.write_row_dict(output_row)

    session.close()
Example #17
def filter_departements():

    basol_source = Dataset("etl", "basol_source")
    basol_filtered = Dataset("etl", "basol_filtered")

    basol_filtered.write_dtype([
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        *basol_source.read_dtype(primary_key="numerobasol")
    ])

    with basol_filtered.get_writer() as writer:
        for row in basol_source.iter_rows():
            if row["departement"] in DEPARTEMENTS:
                writer.write_row_dict(row)
Example #18
def join_sites_localisation():
    """
    Perform a join between the sites table and the localisation table
    """

    # datasets to join
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")
    basias_localisation_with_cadastre = Dataset(
        "etl", "basias_localisation_with_cadastre")

    # output dataset
    basias_sites_localisation_joined = Dataset(
        "etl", "basias_sites_localisation_joined")

    # transform types
    output_dtype = [
        *basias_sites_prepared.read_dtype(),
        *basias_localisation_with_cadastre.read_dtype()
    ]

    output_dtype = merge_dtype(output_dtype)

    basias_sites_localisation_joined.write_dtype(output_dtype)

    # transform data
    BasiasSitesPrepared = basias_sites_prepared.reflect()
    BasiasLocalisation = basias_localisation_with_cadastre.reflect()

    session = basias_sites_prepared.get_session()

    join_query = session \
        .query(BasiasSitesPrepared, BasiasLocalisation) \
        .join(BasiasLocalisation,
              BasiasSitesPrepared.indice_departemental ==
              BasiasLocalisation.indice_departemental,
              isouter=True) \
        .all()

    with basias_sites_localisation_joined.get_writer() as writer:
        for (site, localisation) in join_query:
            output_row = {c.name: None for c in output_dtype}
            output_row = {**output_row, **row2dict(site)}
            if localisation:
                output_row = {**output_row, **row2dict(localisation)}
            del output_row["id"]
            writer.write_row_dict(output_row)

    session.close()
Example #19
def create_loaders(test_dir, test_list, normalise_params, num_workers):
    # Torch libraries
    from torchvision import transforms
    from torch.utils.data import DataLoader, random_split
    # Custom libraries
    if args.data_name == 'nyu':
        from datasets import NYUDataset as Dataset
    elif args.data_name == 'cityscapes':
        from datasets import CSDataset as Dataset
    from datasets import ToTensor, Normalise

    composed_test = transforms.Compose(
        [Normalise(*normalise_params),
         ToTensor()])
    ## Test Set ##
    testset = Dataset(data_file=test_list,
                      data_dir=test_dir,
                      transform_trn=None,
                      transform_val=composed_test)

    logger.info(" Created test set with {} examples".format(len(testset)))
    ## Test Loader ##
    test_loader = DataLoader(testset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=num_workers,
                             pin_memory=True)
    return test_loader
Example #20
def generate_folds_for_dataset():
    dataset_names = Dataset.get_dataset_names() + Dataset.interesting_2d_datasets()

    for dataset_name in dataset_names:

        dataset = Dataset(dataset_name)
        print("making folds for dataset ", dataset_name)
        os.makedirs(os.path.join(FOLD_PATH, dataset_name), exist_ok=True)
        for run_nb in range(10):
            # toon's code
            # skf = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)
            skf = StratifiedKFold(n_splits=10, shuffle=True)
            # skf = KFold(n_splits=10, shuffle=True)
            labels = dataset.target

            for fold_nb, (train_indices, test_indices) in enumerate(skf.split(np.zeros(len(labels)), labels)):

                to_write = dict()
                to_write["train_indices"] = train_indices.tolist()
                to_write["test_indices"] = test_indices.tolist()
                fold_file_path = os.path.join(
                    FOLD_PATH, dataset_name,
                    "run{}_fold{}.txt".format(run_nb, fold_nb))
                if os.path.isfile(fold_file_path):
                    print("fold file already exists! not overwriting!")
                    continue
                with open(fold_file_path, mode='w') as fold_file:
                    json.dump(to_write, fold_file)
Example #21
  def __init__(self):
    #cfg_from_file("./cfgs/shape32.yml")
    cfg_from_file("./cfg/32_chair_table.yml")
    data_path = cfg.DATA_DIR + cfg.DATASET_NAME
    self.dataset = Dataset(data_path, cfg.SPLIT, cfg.CATEGORY, cfg.CHANNEL)
    self.wordtoix = self.dataset.wordtoix


    text_encoder = RNN_ENCODER(self.dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    #state_dict = torch.load("./data/models/text_encoder66.pth", map_location=lambda storage, loc: storage)
    state_dict = torch.load("./networks/pretrain/text_encoder.pth", map_location=lambda storage, loc: storage)
    text_encoder.load_state_dict(state_dict)
    for p in text_encoder.parameters():
        p.requires_grad = False
    print('Load text encoder from:', cfg.TRAIN.NET_E)
    text_encoder.eval()
    self.text_encoder = text_encoder


    self.netG = G_NET(4)
    # self.netG = torch.nn.DataParallel(netG)
    #state_dict = torch.load('./data/models/netG_epoch_39.pth', map_location=lambda storage, loc: storage)
    state_dict = torch.load("./networks/pretrain/netG_epoch.pth", map_location=lambda storage, loc: storage)
    self.netG.load_state_dict(state_dict)
    print('Load G from: ', cfg.TRAIN.NET_G)
    self.netG.eval()

    self.noise = torch.randn(1, 128)
    self.noise.data.normal_(0.5, 0.5)
Example #22
def run(config):
    print(config)
    checkConfigParams(config)
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])

    dataset = Dataset(**config)
    # preconditions: the dataset has generated a pool of data, and the folds have been generated
    if config["kfcv"] == 1 and config["kfcv_serial"] == 1:
        for i in range(0, config["folds"]):
            neuralnet = ConvModel(dataset, **config)
            #validate that the folds work
            #dataset.kfcvPrintFoldInfo()
            #xit()
            gc.collect()
            runNet(neuralnet, config, i)

    # preconditions: the dataset has generated a pool of data, and the folds have been generated
    elif config["kfcv"] == 1:
        neuralnet = ConvModel(dataset, **config)
        runNet(neuralnet, config)

    # preconditions: the dataset has generated distinct training and testing set
    else:
        neuralnet = ConvModel(dataset, **config)
        runNet(neuralnet, config)
Example #23
def main():
    args = ParseArgs()
    model = args.model
    dataset_name = args.dataset_name
    lambda_ = args.lambda_

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainloader, testloader, num_classes = Dataset(dataset_name)
    model = torch.load(args.ckpt).to(device)

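    # Collect every weight tensor in the model and count their parameters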
    weights = [w for name, w in model.named_parameters() if "weight" in name]
    num_features = sum([w.numel() for w in weights])

    criterion = torch.nn.CrossEntropyLoss()

    F, f, norm_l1_x = compute_F(
        trainloader, model, weights, criterion, lambda_)
    density = sum([torch.sum(w != 0).item() for w in weights]) / num_features
    accuracy = check_accuracy(model, testloader)

    print('F:', F)
    print('f:', f)
    print('density:', density)
    print('validation accuracy:', accuracy)
Example #24
def dummy_test():
    datasets = [Dataset("iris")]
    clusterer = COBRAS(correct_noise=True,
                       noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "noise_robust_COBRAS", clusterer,
        ProbabilisticNoisyQuerierBuilder(0.05, 100), datasets)
    clusterer = COBRAS(correct_noise=False,
                       noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "COBRAS_noise", clusterer, ProbabilisticNoisyQuerierBuilder(0.05, 100),
        datasets)
    clusterer = COBRAS(correct_noise=False,
                       noise_probability=0.05,
                       certainty_threshold_keep=0.96,
                       certainty_threshold_reuse=0.96)
    test_active_clustering_algorithm_n_times_n_fold(
        "COBRAS_no_noise", clusterer, ProbabilisticNoisyQuerierBuilder(0, 100),
        datasets)
    calculate_all_aris()
    calculate_all_average_aris()
    compare_algorithms_and_plot_results_n_times_n_fold(
        "test_cobras",
        ["noise_robust_COBRAS", "COBRAS_no_noise", "COBRAS_noise"])
Example #25
def PCK_means_noise_vs_no_noise():
    names = [
        "breast-cancer-wisconsin", "column_2C", "dermatology", "ecoli",
        "glass", "hepatitis", "ionosphere", "iris"
    ]
    datasets = [Dataset(name) for name in names]
    clusterer = QueryFirstActiveClusterer(
        MyMPCKMeans(learn_multiple_full_matrices=True))
    test_active_clustering_algorithm_n_times_n_fold(
        "random_MPCK_means_full_no_noise",
        clusterer,
        RandomQuerierBuilder(None, 100, 0),
        datasets,
        nb_cores=4,
        n=1)
    test_active_clustering_algorithm_n_times_n_fold(
        "random_MPCK_means_full_noise",
        clusterer,
        RandomQuerierBuilder(None, 100, 0.15),
        datasets,
        nb_cores=4,
        n=1)
    calculate_all_aris()
    calculate_all_average_aris()
    comparison_name = "full MPCK-means noise vs no noise"
    algorithms = [
        "random_MPCK_means_full_no_noise", "random_MPCK_means_full_noise"
    ]
    compare_algorithms_and_plot_results_n_times_n_fold(comparison_name,
                                                       algorithms)
Example #26
def get_dataloader(imsize, batch_size):
    global subdataset_idx
    bshuffle = True
    image_transform = transforms.Compose([
        transforms.Resize(int(imsize * 76 / 64)),
        transforms.RandomCrop(imsize),
        transforms.RandomHorizontalFlip()
    ])

    dataset = Dataset(cfg.DATA_DIR, imsize=imsize, transform=image_transform)

    if cfg.TRAIN.DATASET_SIZE != -1:
        if subdataset_idx is None:
            subdataset_idx = random.sample(range(0, len(dataset)),
                                           cfg.TRAIN.DATASET_SIZE)
        dataset = torch.utils.data.Subset(dataset, subdataset_idx)

    assert dataset
    print('training dataset size: ', len(dataset))

    num_gpu = len(cfg.GPU_ID.split(','))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size * num_gpu,
                                             drop_last=True,
                                             shuffle=bshuffle,
                                             num_workers=int(cfg.WORKERS))
    return dataloader
Example #27
    def get_dataset(dataset_id):
        dataset = DatasetService.get_dataset_meta(dataset_id)

        resource = None
        incs = None
        defaults = None
        meta = None
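        # Pick out the CSV data resource and any JSON/YAML metadata resources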
        if dataset:
            for res in dataset['resources']:
                if res['format'].lower() == 'csv':
                    resource = res
                if (res['format'].lower() == 'json'
                        and res['name'].lower() == 'incsinfo'):
                    incs = res
                if (res['format'].lower() == 'json'
                        and res['name'].lower() == 'defaultinfo'):
                    defaults = res
                if (res['format'].lower() == 'yml'
                        and res['name'].lower() == 'metadata'):
                    meta = res

        if resource:
            table_name = resource['id']
            if incs:
                incs = incs.get('url')
            if defaults:
                defaults = defaults.get('url')
            if meta:
                meta = meta.get('url')
            return Dataset(table_name, dataset, incs, defaults, meta)
        else:
            raise toolkit.ObjectNotFound(
                "There's no resource for the given dataset")
Example #28
    def run(self):
        algorithm = algorithm_info_to_object(self.algorithm_name, self.algorithm_parameters)
        querier_builder = querier_info_to_object(self.querier_name, self.querier_parameters)
        dataset = Dataset(self.dataset_name)
        train_indices = fold_path_to_train_indices(self.fold_path)
        querier = querier_builder.build_querier(dataset)
        result = None

        # retry to execute the algorithm 10 times
        # this is because COSC does not always produce a result and ends with an exception
        try:
            result = algorithm.fit(dataset.data, dataset.number_of_classes(), train_indices, querier)
        except Exception as e:
            print("An exception occured during calculation of {} (this is silently ignored):".format(self.result_path), file = sys.stderr)
            traceback.print_exc()

        if result is None:
            return

        # None is not json serializable so use the string "None" instead
        train_indices = train_indices if train_indices is not None else "None"
        full_result = result + (train_indices,)
        os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
        with open(self.result_path, mode="w") as result_file:
            json.dump(full_result, result_file)
Example #29
    def run(self):
        dataset = Dataset(self.dataset_name)
        target = dataset.target
        directory = os.path.join(COP_RF_LIKE_DIR, self.test_name,
                                 'clusterings', self.dataset_name)
        result_aris = dict()
        for run_name in os.listdir(directory):

            # run_name has the form "constraints{},noise{}.txt"
            constraints, noise = run_name[:-4].split(",")
            # con_per = float(constraints[11:])
            noise_per = float(noise[5:])
            full_path = os.path.join(directory, run_name)
            with open(full_path, mode='r') as clustering_file:
                clusterings, runtime, ml, cl, train_indices = json.load(
                    clustering_file)
            last_clustering = clusterings[-1]
            if train_indices == "None":
                ari = get_ARI(last_clustering, target)
            else:
                ari = get_ARI(last_clustering,
                              target,
                              train_indices=train_indices)
            result_aris[noise_per] = ari
        ari_list_in_order = [v for k, v in sorted(result_aris.items())]
        result_file = os.path.join(COP_RF_LIKE_DIR, self.test_name, "aris",
                                   self.dataset_name + "_aris.txt")
        os.makedirs(os.path.dirname(result_file), exist_ok=True)
        with open(result_file, mode="w") as result:
            json.dump(ari_list_in_order, result)
Example #30
def matlab_test():
    dataset = Dataset("iris")
    clusterer = MyCOSCMatlab()
    clusterer.signal_start(dataset.data)
    result = clusterer.fit(dataset.data,
                           [(1, 2), (2, 3), (3, dataset.number_of_instances())],
                           [(10, 12), (23, 16)],
                           dataset.number_of_classes())
    print(result)
    clusterer.signal_end()