示例#1
0
def main(RUNID='run001',
         START_DT_STR=None,
         MODELFILENAME='sm',
         PICKLE_DATA=False,
         DO_TESTS=False,
         PROFILE=False,
         verbose=False,
         debug=False,
         RELOAD=False,
         n_cpus=1,
         PICKLE_NAMES=['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']):
    '''
    Runs our series model or triggered series model job based on the runtime
    conditions and run parameters.

    The model type is chosen by RUNTYPE ('trigger' or 'series'), a name that
    is defined outside this function.  The logfile is always closed on exit,
    including when the run fails with an exception.

    IN:
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading into
            a data frame
        DO_TESTS - bool - if unittests should be run (True), or a job run (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on a
            new instance, then set to False for future runs to save load time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
        PICKLE_NAMES - list of str - list of the X (features) dataframe, y (labels)
            data and spots_used file names.  When RELOAD is set to False (and
            for unittests), this is where the data will be loaded from.  The
            list is only read here, never modified.
    OUT:
        None
    RAISES:
        ValueError - if a job is run (DO_TESTS is False) and RUNTYPE is neither
            'trigger' nor 'series'
    '''

    def _load_pickled_inputs(logfile):
        # Unpickle X, y and the used column headers, reporting the load time.
        start = time.time()
        ptf('\n>> Unpickling data ...\n', logfile)
        features = my_unpickle(PICKLE_NAMES[0])
        labels = my_unpickle(PICKLE_NAMES[1])
        headers = my_unpickle(PICKLE_NAMES[2])
        end = time.time()
        ptf(
            'Data unpickled in %d seconds (%d total trials)' %
            ((end - start), len(features)), logfile)
        return features, labels, headers

    RUNID = command_line_process(RUNID)
    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # Everything below runs with the logfile open; the finally clause makes
    # sure it is closed even when the job raises part-way through.
    try:
        # get the run conditions for the runid from the json
        # NOTE excludes verbose and debug flags - those are fit parameters
        # and excludes runid since that is set up above
        with open((RUNID + '.json')) as f:
            run_params = json.load(f, object_hook=ascii_encode_dict)

        # to see if more ram is used for more cpus
        n_jobs = run_params['detection_model_arguments']['n_jobs']

        ### Unittests ###
        if DO_TESTS:
            X, y, used_column_headers = _load_pickled_inputs(LOGFILE)
            run_tsm_unittests(X,
                              y,
                              used_column_headers.values,
                              verbose=verbose,
                              logfile=LOGFILE)
        else:
            # Fail before the (slow) data load: with any other RUNTYPE no
            # model would be created and the fit below could not run.
            if RUNTYPE not in ('trigger', 'series'):
                raise ValueError(
                    "RUNTYPE must be 'trigger' or 'series', got %r" %
                    (RUNTYPE, ))

            # output run conditions to screen and logfile
            bigstart = time.time()

            # start memory profiling
            if PROFILE:
                # The helper must be *called*; unpacking the bare function
                # object raises TypeError.
                tr, tr_sm = start_memory_profiling()

            # The banner goes to the logfile as well as the screen, like
            # every other ptf call in this function.
            if RUNTYPE == 'trigger':
                ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID, LOGFILE)
            else:
                ptf('*** %s - SERIES MODEL - ***' % RUNID, LOGFILE)

            print_job_info(run_params,
                           n_jobs,
                           n_cpus,
                           RUNID,
                           START_DT_STR,
                           LOGFILE=LOGFILE,
                           debug=debug,
                           profile=PROFILE,
                           verbose=verbose,
                           start=True)

            if RELOAD:
                # reload_data also returns two dataframes that are not
                # needed for the fit.
                X, y, used_column_headers, _df, _df_raw = reload_data(
                    LOGFILE, PICKLE_DATA)
            else:
                X, y, used_column_headers = _load_pickled_inputs(LOGFILE)

            # the model constructors receive the logfile and runid through
            # the run parameters
            run_params['logfile'] = LOGFILE
            run_params['runid'] = RUNID

            # create model
            if RUNTYPE == 'trigger':
                sm = TriggeredSeriesModel(used_column_headers.values,
                                          **run_params)
            else:
                sm = SeriesModel(**run_params)

            # Altogether now
            print('** DOING THE FIT **')
            sm.fit(X, y, verbose=verbose, debug=debug)

            bigend = time.time()

            ptf(
                '====> %d seconds (%0.1f mins)' %
                ((bigend - bigstart), (bigend - bigstart) / 60.0), LOGFILE)
            print_job_info(run_params,
                           n_jobs,
                           n_cpus,
                           RUNID,
                           START_DT_STR,
                           LOGFILE=LOGFILE,
                           debug=debug,
                           profile=PROFILE,
                           verbose=verbose,
                           start=False)

            print_run_details(X, sm, LOGFILE)

            save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

            ## VIEW RESULTS
            if RUNTYPE == 'trigger':
                make_trigger_plots(sm, y, RUNID, debug=debug)
            else:
                make_series_plots(sm)

            if PROFILE:
                # NOTE(review): LOGFILE=None is passed here rather than the
                # open logfile - confirm that this is intentional.
                print_memory_profiles(sm, tr, tr_sm, LOGFILE=None)
    finally:
        LOGFILE.close()
示例#2
0
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # get the run conditions for the runid from the json
    # NOTE excludes verbose and debug flags - those are fit parameters
    # and exludes runid since that is set up above
    with open((RUNID + '.json')) as f:
        run_params = json.load(f, object_hook=ascii_encode_dict)

    # to see if more ram is used for more cpus
    n_jobs = run_params['detection_model_arguments']['n_jobs']

    ### Unittests ###
    if DO_TESTS:
        start = time.time()
        ptf( '\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])

        end = time.time()
        ptf( 'Data unpickled in %d seconds (%d total trials)' % ((end-start), len(X)), LOGFILE)

        tsm_unit = run_tsm_unittests(X, y, used_column_headers.values, verbose=verbose, logfile=LOGFILE)
        # sm_unit = run_unittests(X_test, y_test, verbose=False)
    else:
        # ouptput run conditions to screen and logfile
        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            tr, tr_sm = start_memory_profiling
def main(RUNID='run001', START_DT_STR=None, MODELFILENAME='sm', PICKLE_DATA=False,
    DO_TESTS=False, PROFILE=False, verbose=False, debug=False,
    RELOAD=False, n_cpus=1,
    PICKLE_NAMES=['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']):
    '''
    Runs our series model or triggered series model job based on the runtime
    conditions and run parameters.

    The model type is chosen by RUNTYPE ('trigger' or 'series'), a name that
    is defined outside this function.  The logfile is always closed on exit,
    including when the run fails with an exception.

    IN:
        RUNID - str - str name for the folder where output will be stored and the
            name of the json (without extension) containing run parameters
            for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile.
            Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading into
            a data frame
        DO_TESTS - bool - if unittests should be run (True), or a job run (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a smaller
            set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on a
            new instance, then set to False for future runs to save load time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
        PICKLE_NAMES - list of str - list of the X (features) dataframe, y (labels)
            data and spots_used file names.  When RELOAD is set to False (and
            for unittests), this is where the data will be loaded from.  The
            list is only read here, never modified.
    OUT:
        None
    RAISES:
        ValueError - if a job is run (DO_TESTS is False) and RUNTYPE is neither
            'trigger' nor 'series'
    '''

    def _read_pickles(logfile):
        # Unpickle X, y and the used column headers, reporting the load time.
        start = time.time()
        ptf( '\n>> Unpickling data ...\n', logfile)
        features = my_unpickle(PICKLE_NAMES[0])
        labels = my_unpickle(PICKLE_NAMES[1])
        headers = my_unpickle(PICKLE_NAMES[2])
        end = time.time()
        ptf( 'Data unpickled in %d seconds (%d total trials)' % ((end-start),
            len(features)), logfile)
        return features, labels, headers

    RUNID = command_line_process(RUNID)
    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # The logfile stays open for the whole run; the finally clause closes it
    # even when the job raises part-way through.
    try:
        # get the run conditions for the runid from the json
        # NOTE excludes verbose and debug flags - those are fit parameters
        # and excludes runid since that is set up above
        with open((RUNID + '.json')) as f:
            run_params = json.load(f, object_hook=ascii_encode_dict)

        # to see if more ram is used for more cpus
        n_jobs = run_params['detection_model_arguments']['n_jobs']

        ### Unittests ###
        if DO_TESTS:
            X, y, used_column_headers = _read_pickles(LOGFILE)
            run_tsm_unittests(X, y, used_column_headers.values,
                verbose=verbose, logfile=LOGFILE)
        else:
            # Fail before the (slow) data load: with any other RUNTYPE no
            # model would be created and the fit below could not run.
            if RUNTYPE not in ('trigger', 'series'):
                raise ValueError("RUNTYPE must be 'trigger' or 'series', got %r"
                    % (RUNTYPE,))

            # output run conditions to screen and logfile
            bigstart = time.time()

            # start memory profiling
            if PROFILE:
                # The helper must be *called*; unpacking the bare function
                # object raises TypeError.
                tr, tr_sm = start_memory_profiling()

            # The banner goes to the logfile as well as the screen, like
            # every other ptf call in this function.
            if RUNTYPE == 'trigger':
                ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID, LOGFILE)
            else:
                ptf('*** %s - SERIES MODEL - ***' % RUNID, LOGFILE)

            print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE=LOGFILE,
                debug=debug, profile=PROFILE, verbose=verbose, start=True)

            if RELOAD:
                # reload_data also returns two dataframes that are not
                # needed for the fit.
                X, y, used_column_headers, _df, _df_raw = reload_data(LOGFILE, PICKLE_DATA)
            else:
                X, y, used_column_headers = _read_pickles(LOGFILE)

            # the model constructors receive the logfile and runid through
            # the run parameters
            run_params['logfile'] = LOGFILE
            run_params['runid'] = RUNID

            # create model
            if RUNTYPE == 'trigger':
                sm = TriggeredSeriesModel(used_column_headers.values, **run_params)
            else:
                sm = SeriesModel(**run_params)

            # Altogether now
            print ('** DOING THE FIT **')
            sm.fit(X, y, verbose=verbose, debug=debug)

            bigend = time.time()

            ptf('====> %d seconds (%0.1f mins)' % ((bigend-bigstart), (bigend-bigstart)/60.0), LOGFILE)
            print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR, LOGFILE=LOGFILE,
                debug=debug, profile=PROFILE, verbose=verbose, start=False)

            print_run_details(X, sm, LOGFILE)

            save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

            ## VIEW RESULTS
            if RUNTYPE == 'trigger':
                make_trigger_plots(sm, y, RUNID, debug=debug)
            else:
                make_series_plots(sm)

            if PROFILE:
                # NOTE(review): LOGFILE=None is passed here rather than the
                # open logfile - confirm that this is intentional.
                print_memory_profiles(sm, tr, tr_sm, LOGFILE = None)
    finally:
        LOGFILE.close()