Пример #1
0
def _has_nominal_feature(colmap, dataset_info):
    """Return True if any non-class column described by *colmap* is nominal.

    Args:
        colmap: Mapping of column name to a dict that has a ``"type"`` entry.
        dataset_info: Dict with a ``"cols"`` list; each entry has ``"name"``
            and ``"class"`` entries.
    """
    # Resolve the class columns first, so the nominal test below never runs
    # against a half-determined "is this the class column?" answer.
    class_columns = {
        col["name"] for col in dataset_info["cols"] if col["class"] == True
    }
    return any(
        value["type"] == "nominal" and key not in class_columns
        for key, value in colmap.items())


def runalldatasetsMPI(args,
                      callbacks,
                      datasetlist,
                      mpicomm,
                      mpirank,
                      rootpath,
                      runlist,
                      alphalist,
                      n,
                      printcvresults=False,
                      printcv=False,
                      doevaluation=True,
                      evalcount=False):
    """Run one cross-validation fold per MPI rank for every dataset.

    For each dataset the folds are materialised, this rank runs
    ``rundataset`` on fold number ``mpirank``, and rank 0 collects the
    per-fold results from ranks ``1 .. args.kfold - 1``, writes them to
    ``{rootpath}/n{n}-data.json`` and prints a summary.

    NOTE(review): because fold ``mpirank`` is used and rank 0 receives from
    exactly ``args.kfold - 1`` peers, this presumably has to be launched with
    exactly ``args.kfold`` ranks -- confirm against the launcher.

    Args:
        args: Parsed options; ``onehot`` and ``kfold`` are read here, and the
            object is forwarded to ``rundataset`` and ``printSummary``.
        callbacks: Forwarded to ``rundataset``.
        datasetlist: Iterable of dataset identifiers accepted by ``Dataset``.
        mpicomm: Communicator providing ``barrier``, ``send`` and ``recv``.
        mpirank: Rank of this process; also the index of the fold it runs.
        rootpath: Output directory; a ``{rootpath}/{mpirank}`` subdirectory
            is created for this rank.
        runlist: Forwarded to ``rundataset``.
        alphalist: Forwarded to ``rundataset``.
        n: Repetition index, used in the JSON file name and forwarded on.
        printcvresults: Forwarded to ``rundataset``.
        printcv: If true, print a short fingerprint of this rank's fold.
        doevaluation: Forwarded to ``rundataset``.
        evalcount: Unused in this function; kept for interface compatibility.

    Returns:
        Dict mapping dataset identifier to a dict of per-fold results keyed
        by the fold number as a string. Only populated on rank 0; other ranks
        return an empty dict.
    """
    dataset_results = dict()
    for dataset in datasetlist:
        d = Dataset(dataset)
        dsl, colmap, stratified_fold_generator = fromDataSetToSKLearn(
            d, args.onehot, n_splits=args.kfold)
        data = dsl.getFeatures()
        target = dsl.getTargets()

        datasetInfo = dsl.dataset.datasetInfo
        # Nominal columns only matter when the data was not one-hot encoded.
        anynominal = (not args.onehot
                      and _has_nominal_feature(colmap, datasetInfo))

        min_retain_losses = list()
        min_losses = list()
        dataset_result = dict()

        # Materialise every fold so each rank can pick its own by index.
        foldlist = list(stratified_fold_generator)
        mpicomm.barrier()

        train, test = foldlist[mpirank]
        if printcv:
            trainstr = "-".join(str(x) for x in train)
            trainhash = hashlib.md5(trainstr.encode()).digest()

            teststr = "-".join(str(x) for x in test)
            testhash = hashlib.md5(teststr.encode()).digest()

            print(
                f"summary of this cv-fold, first train: {train[0]} trainhash: {trainhash}"
                f"first test: {test[0]} testhash: {testhash}")
        fold_number = mpirank
        fold_results = dict()
        mpi_rank_rootpath = f"{rootpath}/{mpirank}"
        createdir(mpi_rank_rootpath)
        # Run all the methods in the split, so we can compare them internally
        # later if they are within each others standard deviation.
        fold_results, ranlist = rundataset(anynominal,
                                           args,
                                           callbacks,
                                           colmap,
                                           data,
                                           dataset,
                                           dsl,
                                           fold_results,
                                           min_losses,
                                           min_retain_losses,
                                           mpi_rank_rootpath,
                                           runlist,
                                           target,
                                           test,
                                           train,
                                           alphalist=alphalist,
                                           printcvresults=printcvresults,
                                           n=n,
                                           doevaluation=doevaluation,
                                           fold_number=fold_number)
        mpicomm.barrier()
        if mpirank == 0:
            dataset_result[str(fold_number)] = fold_results
            for source_rank in range(1, args.kfold):
                # Use string keys for every fold so the in-memory dict has
                # the same key type throughout (and matches its JSON form).
                dataset_result[str(source_rank)] = mpicomm.recv(
                    source=source_rank)

            dataset_results[dataset] = dataset_result
            writejson(f"{rootpath}/n{n}-data.json", dataset_results)
            printSummary(dataset, dataset_result, ranlist, n, args)
        else:
            mpicomm.send(fold_results, dest=0)
    return dataset_results
Пример #2
0
def main(mpirank, mpisize, mpicomm):
    """Entry point for the fixed-alpha MPI experiment runner.

    Parses the command line, seeds ``random``, selects a GPU for this rank,
    and runs ``runalldatasetsMPI`` ``args.n`` times with ``alphalist=[0.8]``.
    Rank 0 creates the output directory and writes the settings, the
    accumulated results (``data.json``) and a CSV file.

    Args:
        mpirank: Rank of this process.
        mpisize: Unused in this function; kept for interface compatibility.
        mpicomm: Communicator; must provide ``bcast`` in addition to what
            ``runalldatasetsMPI`` uses.
    """
    args = getArgs()

    if args.seed is None:
        # Draw the seed once on rank 0 and share it; otherwise every rank
        # would seed itself differently and the ranks would not agree.
        seed = random.randrange(sys.maxsize) if mpirank == 0 else None
        seed = mpicomm.bcast(seed, root=0)
        args.seed = seed
        print(f"generating new random seed:{seed}")
    random.seed(args.seed)
    datasetlist = args.datasets
    runlist = args.methods

    # NOTE(review): nothing in this function ever fills `results`, so the CSV
    # written below is built from an empty dict -- confirm whether intended.
    results = {}

    if "," in args.gpu:
        gpus = args.gpu.split(",")
        # Spread ranks round-robin over however many GPUs were listed.
        mygpu = gpus[mpirank % len(gpus)]
        set_keras_growth(int(mygpu))
    else:
        set_keras_growth(args.gpu)

    prefix = args.prefix if args.prefix is not None else "runner"
    rootpath = prefix + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # Every rank must use rank 0's timestamped directory; independently
    # computed timestamps can differ across a second boundary.
    rootpath = mpicomm.bcast(rootpath, root=0)
    if mpirank == 0:
        createdir(rootpath)
        writejson(f"{rootpath}/settings.json", sys.argv[1:])

    callbacks = args.callbacks if args.callbacks is not None else list()
    nresults = list()
    alphalist = [
        0.8
    ]  # this code does not iterate over alpha, see mpi_deesweighting.py
    for run_index in range(0, args.n):
        dataset_results = runalldatasetsMPI(args,
                                            callbacks,
                                            datasetlist,
                                            mpicomm,
                                            mpirank,
                                            rootpath,
                                            runlist,
                                            alphalist,
                                            n=run_index,
                                            printcvresults=args.cvsummary,
                                            printcv=args.printcv,
                                            doevaluation=args.doevaluation)
        nresults.append(dataset_results)

        if mpirank == 0:
            # Rewritten after every repetition, so partial results are kept.
            writejson(f"{rootpath}/data.json", nresults)
            resdf = pd.DataFrame(results)
            resdf.to_csv(
                f"{rootpath}/results_{args.kfold}kfold_{args.epochs}epochs_{args.onehot}onehot.csv"
            )
Пример #3
0
def main(mpirank, mpisize, mpicomm):
    """Entry point for the alpha-sweep MPI experiment runner.

    Parses the command line, optionally seeds ``random``, selects a GPU for
    this rank, builds an evenly spaced range of alpha values and runs
    ``runalldatasetsMPI`` ``args.n`` times with that range. Rank 0 creates
    the output directory and writes the settings, the accumulated results
    (``data.json``) and a CSV file.

    Args:
        mpirank: Rank of this process.
        mpisize: Unused in this function; kept for interface compatibility.
        mpicomm: Communicator; must provide ``bcast`` in addition to what
            ``runalldatasetsMPI`` uses.
    """
    args = getArgs()
    if args.seed is not None:
        random.seed(args.seed)
    datasetlist = args.datasets
    print(f"doing experiment with {datasetlist} in that order")

    runlist = args.methods

    # NOTE(review): nothing in this function ever fills `results`, so the CSV
    # written below is built from an empty dict -- confirm whether intended.
    results = {}

    if "," in args.gpu:
        gpus = args.gpu.split(",")
        # Spread ranks round-robin over however many GPUs were listed.
        mygpu = gpus[mpirank % len(gpus)]
        set_keras_growth(int(mygpu))
    else:
        set_keras_growth(args.gpu)

    prefix = args.prefix if args.prefix is not None else "runner"
    rootpath = prefix + datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    # Every rank must use rank 0's timestamped directory; independently
    # computed timestamps can differ across a second boundary.
    rootpath = mpicomm.bcast(rootpath, root=0)
    if mpirank == 0:
        createdir(rootpath)
        writejson(f"{rootpath}/settings.json", vars(args))

    if args.alpharange is not None:
        # Expected form is "start:stop".
        alphastart, alphastop = (
            float(part) for part in args.alpharange.split(":")[:2])
        alpharange = np.linspace(alphastart, alphastop, args.alphacount)
    else:
        alpharange = np.linspace(0.000001, 1.00001, args.alphacount)

    callbacks = args.callbacks if args.callbacks is not None else list()
    nresults = list()
    for run_index in range(0, args.n):
        dataset_results = runalldatasetsMPI(args,
                                            callbacks,
                                            datasetlist,
                                            mpicomm,
                                            mpirank,
                                            rootpath,
                                            runlist,
                                            alpharange,
                                            n=run_index,
                                            printcvresults=args.cvsummary,
                                            printcv=args.printcv)
        nresults.append(dataset_results)

        if mpirank == 0:
            # Rewritten after every repetition, so partial results are kept.
            writejson(f"{rootpath}/data.json", nresults)
            resdf = pd.DataFrame(results)
            resdf.to_csv(
                f"{rootpath}/results_{args.kfold}kfold_{args.epochs}epochs_{args.onehot}onehot.csv"
            )