Example #1
def collect_nm(processing_dir, collect=False, binary=False):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir        -> Full path to the processing directory
        * collect               -> If True data is checked for failed batches
                                   and collected; if False data is just checked
        * binary                -> If True outputs are stored as binary (.pkl)
                                   files; otherwise as text (.txt) files

    ** Output:
        * Text files containing all results across all batches (the combined
          output)

    written by (primarily) T Wolfers, (adapted) SM Kia
    """
    # import of necessary modules
    import os
    import sys
    import glob
    import numpy as np
    import pandas as pd
    try:
        import nispat.fileio as fileio
    except ImportError:
        # fall back to a local fileio module in the same directory
        path = os.path.abspath(os.path.dirname(__file__))
        if path not in sys.path:
            sys.path.append(path)
            del path
        import fileio

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    file_example = glob.glob(processing_dir + 'batch_1/' + 'yhat' +
                             file_extentions)
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    all_Hyptxt = glob.glob(processing_dir + 'batch_*/' + 'Hyp*')
    if all_Hyptxt != []:
        first_Hyptxt = fileio.load(all_Hyptxt[0])
        first_Hyptxt = first_Hyptxt.transpose()
        nHyp = len(first_Hyptxt)
        dir_first_Hyptxt = os.path.dirname(all_Hyptxt[0])
        all_crossval = glob.glob(dir_first_Hyptxt + '/' + 'Hyp*')
        n_crossval = len(all_crossval)
    else:
        # no hyperparameter files found; skip the Hyp handling below
        nHyp = 0
        n_crossval = 0

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    for batch in glob.glob(processing_dir + 'batch_*/'):
        filepath = glob.glob(batch + 'yhat*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho, batch + 'pRho' + file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho, batch + 'Rho' + file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse, batch + 'rmse' + file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse, batch + 'smse' + file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv, batch + 'expv' + file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll, batch + 'msll' + file_extentions)

                yhat = np.zeros([batch_size, numsubjects])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat, batch + 'yhat' + file_extentions)

                ys2 = np.zeros([batch_size, numsubjects])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2, batch + 'ys2' + file_extentions)

                Z = np.zeros([batch_size, numsubjects])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + file_extentions)

                for n in range(1, n_crossval + 1):
                    hyp = np.zeros([batch_size, nHyp])
                    hyp = pd.DataFrame(hyp)
                    fileio.save(hyp, batch + 'Hyp_' + str(n) +
                                file_extentions)
        else:  # if more than 10% of yhat is nan then consider the batch as a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df,
                       processing_dir + 'failed_batches' + file_extentions)
    else:
        fileio.save(batch_fail_df,
                    processing_dir + 'failed_batches' + file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_combined = pd.concat(pRho_dfs, ignore_index=True)
            fileio.save(pRho_combined,
                        processing_dir + 'pRho' + file_extentions)

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_combined = pd.concat(Rho_dfs, ignore_index=True)
            fileio.save(Rho_combined, processing_dir + 'Rho' + file_extentions)

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_combined = pd.concat(Z_dfs, ignore_index=True)
            fileio.save(Z_combined, processing_dir + 'Z' + file_extentions)

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_combined = pd.concat(yhat_dfs, ignore_index=True)
            fileio.save(yhat_combined,
                        processing_dir + 'yhat' + file_extentions)

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_combined = pd.concat(ys2_dfs, ignore_index=True)
            fileio.save(ys2_combined, processing_dir + 'ys2' + file_extentions)

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'rmse*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_combined = pd.concat(rmse_dfs, ignore_index=True)
            fileio.save(rmse_combined,
                        processing_dir + 'rmse' + file_extentions)

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'smse*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_combined = pd.concat(smse_dfs, ignore_index=True)
            fileio.save(smse_combined,
                        processing_dir + 'smse' + file_extentions)

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'expv*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_combined = pd.concat(expv_dfs, ignore_index=True)
            fileio.save(expv_combined,
                        processing_dir + 'expv' + file_extentions)

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'msll*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_combined = pd.concat(msll_dfs, ignore_index=True)
            fileio.save(msll_combined,
                        processing_dir + 'msll' + file_extentions)

        for n in range(1, n_crossval + 1):
            Hyp_filenames = glob.glob(processing_dir + 'batch_*/' + 'Hyp_' +
                                      str(n) + '.*')
            if Hyp_filenames:
                Hyp_filenames = fileio.sort_nicely(Hyp_filenames)
                Hyp_dfs = []
                for Hyp_filename in Hyp_filenames:
                    Hyp_dfs.append(pd.DataFrame(fileio.load(Hyp_filename)))
                Hyp_combined = pd.concat(Hyp_dfs, ignore_index=True)
                fileio.save(Hyp_combined,
                            processing_dir + 'Hyp_' + str(n) + file_extentions)
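
A minimal usage sketch for this version (the directory path is hypothetical; the function expects processing_dir to end with a slash and the batch jobs to have run already):

processing_dir = '/data/normative_run/'  # hypothetical path, note the trailing '/'

# First pass: only check and report failed batches, nothing is written back.
collect_nm(processing_dir, collect=False, binary=False)

# Second pass: write placeholder files for failed batches and combine the
# per-batch outputs into single files (yhat.txt, Z.txt, Rho.txt, ...).
collect_nm(processing_dir, collect=True, binary=False)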
Example #2
def split_nm(processing_dir,
             respfile_path,
             batch_size,
             binary,
             testrespfile_path=None):
    """ This function prepares the input files for normative_parallel.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)
        * binary            -> If True the response file is binary (.pkl);
                               otherwise plain text (.txt)

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    # import of necessary modules
    import os
    import sys
    import numpy as np
    import pandas as pd

    try:
        import nispat.fileio as fileio
    except ImportError:
        # fall back to a local fileio module in the same directory
        path = os.path.abspath(os.path.dirname(__file__))
        if path not in sys.path:
            sys.path.append(path)
            del path
        import fileio

    dummy, respfile_extension = os.path.splitext(respfile_path)
    if (binary and respfile_extension != '.pkl'):
        raise ValueError("""If binary is True the file format for the
              response file must be .pkl""")
    elif (binary == False and respfile_extension != '.txt'):
        raise ValueError("""If binary is False the file format for the
              response file must be .txt""")

    # splits response into batches
    if testrespfile_path is None:
        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)

        respfile = pd.DataFrame(respfile)

        numcol = len(respfile.iloc[0, :])
        batch_vec = np.arange(0, numcol, batch_size)
        batch_vec = np.append(batch_vec, numcol)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                if (binary == False):
                    fileio.save_pd(
                        resp_batch,
                        processing_dir + batch + '/' + resp + '.txt')
                else:
                    resp_batch.to_pickle(processing_dir + batch + '/' + resp +
                                         '.pkl')

    # splits response and test responsefile into batches
    else:
        dummy, testrespfile_extension = os.path.splitext(testrespfile_path)
        if (binary and testrespfile_extension != '.pkl'):
            raise ValueError("""If binary is True the file format for the
                  testrespfile file must be .pkl""")
        elif (binary == False and testrespfile_extension != '.txt'):
            raise ValueError("""If binary is False the file format for the
                  testrespfile file must be .txt""")

        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
            testrespfile = fileio.load_ascii(testrespfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)
            testrespfile = pd.read_pickle(testrespfile_path)

        respfile = pd.DataFrame(respfile)
        testrespfile = pd.DataFrame(testrespfile)

        numcol = len(respfile.iloc[0, :])
        batch_vec = np.arange(0, numcol, batch_size)
        batch_vec = np.append(batch_vec, numcol)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:,
                                               (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                if (binary == False):
                    fileio.save_pd(
                        resp_batch,
                        processing_dir + batch + '/' + resp + '.txt')
                    fileio.save_pd(
                        testresp_batch,
                        processing_dir + batch + '/' + testresp + '.txt')
                else:
                    resp_batch.to_pickle(processing_dir + batch + '/' + resp +
                                         '.pkl')
                    testresp_batch.to_pickle(processing_dir + batch + '/' +
                                             testresp + '.pkl')
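
A short sketch of how this variant might be called; the file paths are hypothetical, and both response files are expected as subjects x features matrices:

# Split train and test responses into batches of 50 features each; this
# creates batch_1/, batch_2/, ... holding resp_batch_* and testresp_batch_*.
split_nm('/data/normative_run/',
         '/data/resp_train.txt',
         batch_size=50,
         binary=False,
         testrespfile_path='/data/resp_test.txt')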
Example #3
def collect_nm(processing_dir,
               job_name,
               func='estimate',
               collect=False,
               binary=False,
               batch_size=None,
               outputsuffix=''):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir        -> Full path to the processing directory
        * collect               -> If True data is checked for failed batches
                                   and collected; if False data is just checked
        * job_name              -> Name of the submitted job (used to find the
                                   batch *.sh files)
        * func                  -> Function that was parallelised (e.g.
                                   'estimate' or 'predict')
        * binary                -> If True outputs are stored as binary (.pkl)
                                   files; otherwise as text (.txt) files
        * batch_size            -> Number of features per batch (re-detected
                                   from the first batch output)
        * outputsuffix          -> Suffix appended to the output file names

    ** Output:
        * Text files containing all results across all batches (the combined
          output)

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    batches = glob.glob(processing_dir + 'batch_*/')
    file_example = []
    for batch in batches:
        if file_example == []:
            file_example = glob.glob(batch + 'yhat' + outputsuffix +
                                     file_extentions)
        else:
            break
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    batch_dirs = glob.glob(processing_dir + 'batch_*/')
    batch_dirs = fileio.sort_nicely(batch_dirs)
    for batch in batch_dirs:
        filepath = glob.glob(batch + 'yhat' + outputsuffix + '*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/' + job_name + '*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho,
                            batch + 'pRho' + outputsuffix + file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho,
                            batch + 'Rho' + outputsuffix + file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse,
                            batch + 'RMSE' + outputsuffix + file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse,
                            batch + 'SMSE' + outputsuffix + file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv,
                            batch + 'EXPV' + outputsuffix + file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll,
                            batch + 'MSLL' + outputsuffix + file_extentions)

                yhat = np.zeros([numsubjects, batch_size])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat,
                            batch + 'yhat' + outputsuffix + file_extentions)

                ys2 = np.zeros([numsubjects, batch_size])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2,
                            batch + 'ys2' + outputsuffix + file_extentions)

                Z = np.zeros([numsubjects, batch_size])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + outputsuffix + file_extentions)

                if not os.path.isdir(batch + 'Models'):
                    os.mkdir(batch + 'Models')

        else:  # if more than 10% of yhat is nan then consider the batch as a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df,
                       processing_dir + 'failed_batches' + file_extentions)
    else:
        fileio.save(batch_fail_df,
                    processing_dir + 'failed_batches' + file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho' +
                                   outputsuffix + '*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_dfs = pd.concat(pRho_dfs, ignore_index=True, axis=0)
            fileio.save(
                pRho_dfs,
                processing_dir + 'pRho' + outputsuffix + file_extentions)
            del pRho_dfs

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho' +
                                  outputsuffix + '*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_dfs = pd.concat(Rho_dfs, ignore_index=True, axis=0)
            fileio.save(
                Rho_dfs,
                processing_dir + 'Rho' + outputsuffix + file_extentions)
            del Rho_dfs

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z' +
                                outputsuffix + '*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_dfs = pd.concat(Z_dfs, ignore_index=True, axis=1)
            fileio.save(Z_dfs,
                        processing_dir + 'Z' + outputsuffix + file_extentions)
            del Z_dfs

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat' +
                                   outputsuffix + '*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_dfs = pd.concat(yhat_dfs, ignore_index=True, axis=1)
            fileio.save(
                yhat_dfs,
                processing_dir + 'yhat' + outputsuffix + file_extentions)
            del yhat_dfs

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2' +
                                  outputsuffix + '*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_dfs = pd.concat(ys2_dfs, ignore_index=True, axis=1)
            fileio.save(
                ys2_dfs,
                processing_dir + 'ys2' + outputsuffix + file_extentions)
            del ys2_dfs

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'RMSE' +
                                   outputsuffix + '*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_dfs = pd.concat(rmse_dfs, ignore_index=True, axis=0)
            fileio.save(
                rmse_dfs,
                processing_dir + 'RMSE' + outputsuffix + file_extentions)
            del rmse_dfs

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'SMSE' +
                                   outputsuffix + '*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_dfs = pd.concat(smse_dfs, ignore_index=True, axis=0)
            fileio.save(
                smse_dfs,
                processing_dir + 'SMSE' + outputsuffix + file_extentions)
            del smse_dfs

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'EXPV' +
                                   outputsuffix + '*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_dfs = pd.concat(expv_dfs, ignore_index=True, axis=0)
            fileio.save(
                expv_dfs,
                processing_dir + 'EXPV' + outputsuffix + file_extentions)
            del expv_dfs

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'MSLL' +
                                   outputsuffix + '*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_dfs = pd.concat(msll_dfs, ignore_index=True, axis=0)
            fileio.save(
                msll_dfs,
                processing_dir + 'MSLL' + outputsuffix + file_extentions)
            del msll_dfs

        if func != 'predict':
            if not os.path.isdir(processing_dir + 'Models') and \
               os.path.exists(os.path.join(batches[0], 'Models')):
                os.mkdir(processing_dir + 'Models')

            meta_filenames = glob.glob(processing_dir + 'batch_*/Models/' +
                                       'meta_data.md')
            mY = []
            sY = []
            mX = []
            sX = []
            if meta_filenames:
                meta_filenames = fileio.sort_nicely(meta_filenames)
                with open(meta_filenames[0], 'rb') as file:
                    meta_data = pickle.load(file)
                if meta_data['standardize']:
                    for meta_filename in meta_filenames:
                        # load each batch's meta data, not only the first one
                        with open(meta_filename, 'rb') as file:
                            batch_meta = pickle.load(file)
                        mY.append(batch_meta['mean_resp'])
                        sY.append(batch_meta['std_resp'])
                        mX.append(batch_meta['mean_cov'])
                        sX.append(batch_meta['std_cov'])
                    meta_data['mean_resp'] = np.stack(mY)
                    meta_data['std_resp'] = np.stack(sY)
                    meta_data['mean_cov'] = np.stack(mX)
                    meta_data['std_cov'] = np.stack(sX)

                with open(
                        os.path.join(processing_dir, 'Models', 'meta_data.md'),
                        'wb') as file:
                    pickle.dump(meta_data, file)

            batch_dirs = glob.glob(processing_dir + 'batch_*/')
            if batch_dirs:
                batch_dirs = fileio.sort_nicely(batch_dirs)
                for b, batch_dir in enumerate(batch_dirs):
                    src_files = glob.glob(batch_dir + 'Models/*.pkl')
                    src_files = fileio.sort_nicely(src_files)
                    for f, full_file_name in enumerate(src_files):
                        if os.path.isfile(full_file_name):
                            file_name = full_file_name.split('/')[-1]
                            n = file_name.split('_')
                            n[-1] = str(b * batch_size + f) + '.pkl'
                            n = '_'.join(n)
                            shutil.copy(full_file_name,
                                        processing_dir + 'Models/' + n)

    if not batch_fail:
        return 1
    else:
        return 0
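
Unlike the earlier versions, this one returns 1 when every batch succeeded and 0 otherwise, so callers can decide whether to resubmit. A sketch with hypothetical paths, job name, and output suffix:

# Collect an 'estimate' run that wrote binary outputs with suffix '_estimate'.
ok = collect_nm('/data/normative_run/', 'nm_job',
                func='estimate', collect=True, binary=True,
                outputsuffix='_estimate')
if ok == 0:
    print('Some batches failed; see failed_batches.pkl and resubmit them.')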
Example #4
def split_nm(processing_dir, respfile_path, batch_size, binary, **kwargs):
    """ This function prepares the input files for normative_parallel.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)
        * binary            -> If True the response file is binary (.pkl);
                               otherwise plain text (.txt)

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    testrespfile_path = kwargs.pop('testrespfile_path', None)

    dummy, respfile_extension = os.path.splitext(respfile_path)
    if (binary and respfile_extension != '.pkl'):
        raise ValueError("""If binary is True the file format for the
              response file must be .pkl""")
    elif (binary == False and respfile_extension != '.txt'):
        raise ValueError("""If binary is False the file format for the
              response file must be .txt""")

    # splits response into batches
    if testrespfile_path is None:
        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)

        respfile = pd.DataFrame(respfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)

        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' + resp +
                                     '.pkl')

    # splits response and test responsefile into batches
    else:
        dummy, testrespfile_extension = os.path.splitext(testrespfile_path)
        if (binary and testrespfile_extension != '.pkl'):
            raise ValueError("""If binary is True the file format for the
                  testrespfile file must be .pkl""")
        elif (binary == False and testrespfile_extension != '.txt'):
            raise ValueError("""If binary is False the file format for the
                  testrespfile file must be .txt""")

        if (binary == False):
            respfile = fileio.load_ascii(respfile_path)
            testrespfile = fileio.load_ascii(testrespfile_path)
        else:
            respfile = pd.read_pickle(respfile_path)
            testrespfile = pd.read_pickle(testrespfile_path)

        respfile = pd.DataFrame(respfile)
        testrespfile = pd.DataFrame(testrespfile)

        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        for n in range(0, (len(batch_vec) - 1)):
            resp_batch = respfile.iloc[:, (batch_vec[n]):batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:,
                                               (batch_vec[n]):batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
            if (binary == False):
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
                fileio.save_pd(
                    testresp_batch,
                    processing_dir + batch + '/' + testresp + '.txt')
            else:
                resp_batch.to_pickle(processing_dir + batch + '/' + resp +
                                     '.pkl')
                testresp_batch.to_pickle(processing_dir + batch + '/' +
                                         testresp + '.pkl')
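
The number of batches this produces follows directly from the number of response columns; a toy illustration of the batch_vec arithmetic used above (numbers are made up):

import numpy as np

numcol, batch_size = 9, 4
batch_vec = np.append(np.arange(0, numcol, batch_size), numcol)  # [0 4 8 9]
# Consecutive pairs give the column slices, so 9 features in batches of 4
# yield 3 batches with 4, 4 and 1 features respectively.
print(np.diff(batch_vec))  # [4 4 1]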
Example #5
def collect_nm(processing_dir):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir        -> Full path to the processing directory

    ** Output:
        * Text files containing all results across all batches (the combined
          output)

    written by Thomas Wolfers
    """
    # import of necessary modules
    import glob
    import numpy as np
    import os
    import pandas as pd
    import nispat.fileio as fileio

    # detect number of subjects, batches, hyperparameters and CV
    file_example = glob.glob(processing_dir + 'batch_1/' + 'resp*.txt')
    file_example = fileio.load_pd(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    all_Hyptxt = glob.glob(processing_dir + 'batch_*/' + 'Hyp_*')
    first_Hyptxt = fileio.load_pd(all_Hyptxt[0])

    first_Hyptxt = first_Hyptxt.transpose()
    nHyp = len(first_Hyptxt)

    dir_first_Hyptxt = os.path.dirname(all_Hyptxt[0])
    all_crossval = glob.glob(dir_first_Hyptxt + '/' + 'Hyp_*')
    n_crossval = len(all_crossval)

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    for batch in glob.glob(processing_dir + 'batch_*/'):
        filepath = glob.glob(batch + 'pRho*')
        if filepath == []:
            pRho = np.ones(batch_size)
            pRho = pRho.transpose()
            pRho = pd.Series(pRho)
            fileio.save_pd(pRho, batch + 'pRho.txt')

            Rho = np.zeros(batch_size)
            Rho = Rho.transpose()
            Rho = pd.Series(Rho)
            fileio.save_pd(Rho, batch + 'Rho.txt')

            rmse = np.zeros(batch_size)
            rmse = rmse.transpose()
            rmse = pd.Series(rmse)
            fileio.save_pd(rmse, batch + 'rmse.txt')

            smse = np.zeros(batch_size)
            smse = smse.transpose()
            smse = pd.Series(smse)
            fileio.save_pd(smse, batch + 'smse.txt')

            Z = np.zeros([batch_size, numsubjects])
            Z = pd.DataFrame(Z)
            fileio.save_pd(Z, batch + 'Z.txt')

            yhat = np.zeros([batch_size, numsubjects])
            yhat = pd.DataFrame(yhat)
            fileio.save_pd(yhat, batch + 'yhat.txt')

            ys2 = np.zeros([batch_size, numsubjects])
            ys2 = pd.DataFrame(ys2)
            fileio.save_pd(ys2, batch + 'ys2.txt')

            for n in range(1, n_crossval + 1):
                hyp = np.zeros([batch_size, nHyp])
                hyp = pd.DataFrame(hyp)
                fileio.save_pd(hyp, batch + 'Hyp_' + str(n) + '.txt')

            count = count + 1
            print(batch)
            batch_fail.append(batch)

    # list batches that were not executed
    print('Number of batches that failed: ' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches.txt')

    # combines all output files across batches
    pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho*')
    pRho_filenames = fileio.sort_nicely(pRho_filenames)
    pRho_dfs = []
    for pRho_filename in pRho_filenames:
        pRho_dfs.append(fileio.load_pd(pRho_filename))
    pRho_combined = pd.concat(pRho_dfs, ignore_index=True)
    fileio.save_pd(pRho_combined, processing_dir + 'pRho.txt')

    Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho*')
    Rho_filenames = fileio.sort_nicely(Rho_filenames)
    Rho_dfs = []
    for Rho_filename in Rho_filenames:
        Rho_dfs.append(fileio.load_pd(Rho_filename))
    Rho_combined = pd.concat(Rho_dfs, ignore_index=True)
    fileio.save_pd(Rho_combined, processing_dir + 'Rho.txt')

    Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z*')
    Z_filenames = fileio.sort_nicely(Z_filenames)
    Z_dfs = []
    for Z_filename in Z_filenames:
        Z_dfs.append(fileio.load_pd(Z_filename))
    Z_combined = pd.concat(Z_dfs, ignore_index=True)
    fileio.save_pd(Z_combined, processing_dir + 'Z.txt')

    yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat*')
    yhat_filenames = fileio.sort_nicely(yhat_filenames)
    yhat_dfs = []
    for yhat_filename in yhat_filenames:
        yhat_dfs.append(fileio.load_pd(yhat_filename))
    yhat_combined = pd.concat(yhat_dfs, ignore_index=True)
    fileio.save_pd(yhat_combined, processing_dir + 'yhat.txt')

    ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2*')
    ys2_filenames = fileio.sort_nicely(ys2_filenames)
    ys2_dfs = []
    for ys2_filename in ys2_filenames:
        ys2_dfs.append(fileio.load_pd(ys2_filename))
    ys2_combined = pd.concat(ys2_dfs, ignore_index=True)
    fileio.save_pd(ys2_combined, processing_dir + 'ys2.txt')

    rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'rmse*')
    rmse_filenames = fileio.sort_nicely(rmse_filenames)
    rmse_dfs = []
    for rmse_filename in rmse_filenames:
        rmse_dfs.append(fileio.load_pd(rmse_filename))
    rmse_combined = pd.concat(rmse_dfs, ignore_index=True)
    fileio.save_pd(rmse_combined, processing_dir + 'rmse.txt')

    smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'smse*')
    smse_filenames = fileio.sort_nicely(smse_filenames)
    smse_dfs = []
    for smse_filename in smse_filenames:
        smse_dfs.append(fileio.load_pd(smse_filename))
    smse_combined = pd.concat(smse_dfs, ignore_index=True)
    fileio.save_pd(smse_combined, processing_dir + 'smse.txt')

    for n in range(1, n_crossval + 1):
        Hyp_filenames = glob.glob(processing_dir + 'batch_*/' + 'Hyp_' +
                                  str(n) + '.*')
        Hyp_filenames = fileio.sort_nicely(Hyp_filenames)
        Hyp_dfs = []
        for Hyp_filename in Hyp_filenames:
            Hyp_dfs.append(fileio.load_pd(Hyp_filename))
        Hyp_combined = pd.concat(Hyp_dfs, ignore_index=True)
        fileio.save_pd(Hyp_combined, processing_dir + 'Hyp_' + str(n) + '.txt')
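
The zero- and one-filled files written for failed batches exist only so that the final pd.concat stays aligned with the batch order; a toy illustration of that idea (not part of the function):

import numpy as np
import pandas as pd

batch_ok = pd.DataFrame(np.random.rand(3, 4))   # a batch that ran
batch_failed = pd.DataFrame(np.zeros((3, 4)))   # placeholder for a failed batch
combined = pd.concat([batch_ok, batch_failed], ignore_index=True)
print(combined.shape)  # (6, 4): every batch still occupies its own block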
Example #6
def split_nm(processing_dir,
             respfile_path,
             batch_size,
             testrespfile_path=None):
    """ This function prepares the input files for parallel normative modelling.

    ** Input:
        * processing_dir    -> Full path to the folder of processing
        * respfile_path     -> Full path to the responsefile.txt
                               (subjects x features)
        * batch_size        -> Number of features in each batch
        * testrespfile_path -> Full path to the test responsefile.txt
                               (subjects x features)

    ** Output:
        * The creation of a folder structure for batch-wise processing

    written by Thomas Wolfers
    """

    # import of necessary modules
    import numpy as np
    import os
    import nispat.fileio as fileio

    # splits response into batches
    if testrespfile_path is None:
        respfile = fileio.load_pd(respfile_path)
        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        for n in range(0, (len(batch_vec) - 1)):
            # .iloc uses half-open slices, so no index shifting is needed
            resp_batch = respfile.iloc[:, batch_vec[n]:batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')

    # splits response and test responsefile into batches
    else:
        respfile = fileio.load_pd(respfile_path)
        testrespfile = fileio.load_pd(testrespfile_path)
        numsub = respfile.shape[1]
        batch_vec = np.arange(0, numsub, batch_size)
        batch_vec = np.append(batch_vec, numsub)
        for n in range(0, (len(batch_vec) - 1)):
            # .iloc uses half-open slices, so no index shifting is needed
            resp_batch = respfile.iloc[:, batch_vec[n]:batch_vec[n + 1]]
            testresp_batch = testrespfile.iloc[:,
                                               batch_vec[n]:batch_vec[n + 1]]
            os.chdir(processing_dir)
            resp = str('resp_batch_' + str(n + 1))
            testresp = str('testresp_batch_' + str(n + 1))
            batch = str('batch_' + str(n + 1))
            if not os.path.exists(processing_dir + batch):
                os.makedirs(processing_dir + batch)
                fileio.save_pd(resp_batch,
                               processing_dir + batch + '/' + resp + '.txt')
                fileio.save_pd(
                    testresp_batch,
                    processing_dir + batch + '/' + testresp + '.txt')
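
A minimal call for this oldest variant, which only handles plain-text response files and pairs with the collect_nm shown in Example #5 (the paths are hypothetical):

# Split a text response file (subjects x features) into batches of 100
# features, one sub-directory per batch under the processing directory.
split_nm('/data/normative_run/',
         '/data/resp_train.txt',
         batch_size=100)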