Пример #1
0
def plot_probe_results(filename, outdir):
    """Load pickled probe results from `filename` and plot them into `outdir`.

    The pickle is expected to hold a mapping keyed by
    (file name, model count, probe name).  Each key is rewritten to
    (probe name, model count, analysis count), where the analysis count
    is obtained from the file name via analysis_count_from_file_name,
    and the resulting (condition, value) pairs are passed to
    plot_results.
    """
    log("Loading probe results from %s" % filename)
    with open(filename, "r") as pickled:
        raw = pickle.load(pickled)
    conditioned = []
    for key, val in raw.iteritems():
        fname, n_models, probe = key
        condition = (probe, n_models, analysis_count_from_file_name(fname))
        conditioned.append((condition, val))
    plot_results(conditioned, outdir=outdir)
Пример #2
0
 def snapshot():
     """Vacuum the database and save a copy tagged with the current progress.

     The copy's file name embeds the model count and the current
     iteration count.  The copy path and a matching "-meta.txt" path
     are then handed to report().

     Raises subprocess.CalledProcessError if the copy fails.
     """
     # Imported locally so this function does not rely on a module-level
     # import of subprocess.
     import subprocess
     log('vacuuming')
     bdb.sql_execute('vacuum')
     cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
     save_file_name = out_file_name('satellites', cur_infix + '.bdb')
     meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
     log('recording snapshot ' + save_file_name)
     # Use an argument list rather than a shell string: paths containing
     # spaces or shell metacharacters are copied correctly, and a failed
     # copy raises instead of being silently ignored.
     subprocess.check_call(["cp", bdb_file, save_file_name])
     report(save_file_name, meta_file_name)
Пример #3
0
 def snapshot():
     """Vacuum the database and save a copy tagged with the current progress.

     The copy's file name embeds the model count and the current
     iteration count.  The copy path and a matching "-meta.txt" path
     are then handed to report().

     Raises subprocess.CalledProcessError if the copy fails.
     """
     # Imported locally so this function does not rely on a module-level
     # import of subprocess.
     import subprocess
     log('vacuuming')
     bdb.sql_execute('vacuum')
     cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
     save_file_name = out_file_name('satellites', cur_infix + '.bdb')
     meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
     log('recording snapshot ' + save_file_name)
     # Use an argument list rather than a shell string: paths containing
     # spaces or shell metacharacters are copied correctly, and a failed
     # copy raises instead of being silently ignored.
     subprocess.check_call(["cp", bdb_file, save_file_name])
     report(save_file_name, meta_file_name)
Пример #4
0
 def final_report():
     """Save the logscore diagnostics plot and emit the final metadata report.

     The plot is written to the '-logscores.pdf' output file; report()
     is then called for the main bdb file with echo enabled and the
     plot path attached.
     """
     diagnostics_path = out_file_name('satellites', '-logscores.pdf')
     log('writing diagnostic plot to %s' % diagnostics_path)
     # The returned figure is not needed; savefig writes the current figure.
     bdbcontrib.plot_crosscat_chain_diagnostics(
         bdb, 'logscore', 'satellites_cc')
     plt.savefig(diagnostics_path)
     report(
         bdb_file,
         out_file_name('satellites', '-meta.txt'),
         echo=True,
         plot_file_name=diagnostics_path,
     )
Пример #5
0
 def final_report():
     """Save the logscore diagnostics plot and emit the final metadata report.

     The plot is written to the '-logscores.pdf' output file; report()
     is then called for the main bdb file with echo enabled and the
     plot path attached.
     """
     diagnostics_path = out_file_name('satellites', '-logscores.pdf')
     log('writing diagnostic plot to %s' % diagnostics_path)
     # The returned figure is not needed; savefig writes the current figure.
     make_plot = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics
     make_plot(bdb, 'logscore', 'satellites_cc')
     plt.savefig(diagnostics_path)
     metadata_path = out_file_name('satellites', '-meta.txt')
     report(bdb_file, metadata_path,
            echo=True, plot_file_name=diagnostics_path)
Пример #6
0
def doit(files, outfile, model_schedule, n_replications):
    """Run the satellites probes over `files` and pickle the results.

    The directory part of `outfile` is created if it is non-empty and
    missing.  The three probe suites are run against the
    "satellites_cc" generator via probe_fileset, with `model_schedule`
    and `n_replications` forwarded unchanged, and the returned results
    are pickled to `outfile`.
    """
    target_dir = os.path.dirname(outfile)
    needs_dir = bool(target_dir) and not os.path.isdir(target_dir)
    if needs_dir:
        os.makedirs(target_dir)

    probe_suites = [
        country_purpose_probes,
        unlikely_periods_probes,
        orbit_type_imputation_probes,
    ]
    probe_results = probe_fileset(
        files, "satellites_cc", probe_suites,
        model_schedule=model_schedule,
        n_replications=n_replications)

    with open(outfile, "w") as sink:
        pickle.dump(probe_results, sink)
    log("Saved probe results to %s" % outfile)
Пример #7
0
def doit(files, outfile, model_schedule, n_replications):
    """Probe every file in `files` and save the aggregate to `outfile`.

    Ensures the parent directory of `outfile` exists (when `outfile`
    has one), runs the country/purpose, unlikely-periods and orbit-type
    imputation probes on the "satellites_cc" generator through
    probe_fileset, and pickles whatever probe_fileset returns.
    """
    parent = os.path.dirname(outfile)
    if parent:
        if not os.path.isdir(parent):
            os.makedirs(parent)

    probe_kwargs = dict(model_schedule=model_schedule,
                        n_replications=n_replications)
    outcome = probe_fileset(
        files,
        "satellites_cc",
        [country_purpose_probes,
         unlikely_periods_probes,
         orbit_type_imputation_probes],
        **probe_kwargs)

    with open(outfile, "w") as out:
        pickle.dump(outcome, out)
    log("Saved probe results to %s" % outfile)
Пример #8
0
def plot_results(results, outdir="figures", ext=".png"):
    """Plot the aggregate results of probing.

    `results` is a list of pairs giving probe conditions and
    aggregated probe results.

    `outdir` is the name of a directory to which to write the visualizations.
    It is created if it does not exist.  Default: "figures".

    `ext` is the file extension for visualizations, which determines
    the image format used.  Default ".png".

    Each probe condition is expected to be a 3-tuple: probe name,
    model count, analysis iteration count.  Each result is expected to
    be a tagged aggregate (see aggregation.py).

    Each numerical probe produces one plot, named after the probe
    (spaces become "-" and "/" is dropped in the file name).
    The plot facets over the model count, displays the iteration count
    on the x-axis, and a violin plot of the results on the y axis.

    All boolean probes are aggregated into one plot named
    "boolean-probes", whose y axis is the frequency of a "True"
    result.  Each probe is a line giving the relationship of the
    frequency to the number of analysis iterations.

    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    replications = num_replications(results)

    def save_and_close(grid, figname):
        # Write the grid into outdir and release its figure; return the path.
        savepath = os.path.join(outdir, figname)
        grid.savefig(savepath)
        plt.close(grid.fig)
        return savepath

    probes = sorted(
        set((pname, ptype) for ((pname, _, _), (ptype, _)) in results))
    for probe, ptype in probes:
        if ptype != 'num':
            continue
        grid = plot_results_numerical(results, probe)
        grid.fig.suptitle(probe + ", %d replications" % replications)
        # XXX Actually shell quote the probe name
        # str.replace is used instead of the deprecated string.replace
        # module function, which no longer exists in Python 3.
        figname = probe.replace(" ", "-").replace("/", "") + ext
        savepath = save_and_close(grid, figname)
        log("Probe '%s' results saved to %s" % (probe, savepath))

    grid = plot_results_boolean(results)
    grid.fig.suptitle("Boolean probes, %d replications" % replications)
    savepath = save_and_close(grid, "boolean-probes" + ext)
    log("Boolean probe results saved to %s" % (savepath, ))
Пример #9
0
def plot_results(results, outdir="figures", ext=".png"):
    """Plot the aggregate results of probing.

    `results` is a list of pairs giving probe conditions and
    aggregated probe results.

    `outdir` is the name of a directory to which to write the visualizations.
    It is created if it does not exist.  Default: "figures".

    `ext` is the file extension for visualizations, which determines
    the image format used.  Default ".png".

    Each probe condition is expected to be a 3-tuple: probe name,
    model count, analysis iteration count.  Each result is expected to
    be a tagged aggregate (see aggregation.py).

    Each numerical probe produces one plot, named after the probe
    (spaces become "-" and "/" is dropped in the file name).
    The plot facets over the model count, displays the iteration count
    on the x-axis, and a violin plot of the results on the y axis.

    All boolean probes are aggregated into one plot named
    "boolean-probes", whose y axis is the frequency of a "True"
    result.  Each probe is a line giving the relationship of the
    frequency to the number of analysis iterations.

    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    replications = num_replications(results)

    def save_and_close(grid, figname):
        # Write the grid into outdir and release its figure; return the path.
        savepath = os.path.join(outdir, figname)
        grid.savefig(savepath)
        plt.close(grid.fig)
        return savepath

    probes = sorted(set((pname, ptype)
                        for ((pname, _, _), (ptype, _)) in results))
    for probe, ptype in probes:
        if ptype != 'num':
            continue
        grid = plot_results_numerical(results, probe)
        grid.fig.suptitle(probe + ", %d replications" % replications)
        # XXX Actually shell quote the probe name
        # str.replace is used instead of the deprecated string.replace
        # module function, which no longer exists in Python 3.
        figname = probe.replace(" ", "-").replace("/", "") + ext
        savepath = save_and_close(grid, figname)
        log("Probe '%s' results saved to %s" % (probe, savepath))

    grid = plot_results_boolean(results)
    grid.fig.suptitle("Boolean probes, %d replications" % replications)
    savepath = save_and_close(grid, "boolean-probes" + ext)
    log("Boolean probe results saved to %s" % (savepath,))
Пример #10
0
 def execute(bql):
     """Log the BQL statement `bql`, then run it on the open database."""
     announcement = "executing %s" % bql
     log(announcement)
     bdb.execute(bql)
Пример #11
0
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build, analyze and snapshot the satellites database.

    Loads ``satellites.csv`` (located next to this script) into a new
    on-disk bdb under `out_dir`, creates the ``satellites_cc`` crosscat
    generator, initializes `num_models` models and analyzes them in
    steps of `checkpoint_freq` iterations until at least `num_iters`
    iterations have run.  A copy of the database plus a metadata file
    is written before analysis starts and after every step.  At the
    end a diagnostics plot and a final metadata file are written, and
    ``satellites.bdb`` in `out_dir` is symlinked to the result.

    `seed` is passed to the crosscat multiprocessing engine.

    Exits with status 1 if the target bdb file already exists.
    Raises subprocess.CalledProcessError if a snapshot copy fails.
    """
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        # All outputs share the date/user stamp so runs are distinguishable.
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Same bytes on stdout as the former print statement, written in
        # a form that is valid syntax on both Python 2 and 3.
        sys.stdout.write(
            'Error: File %s already exists. Please remove it.\n' % bdb_file)
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    csv_file,
                                    header=True,
                                    create=True,
                                    ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator using
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models, ))

    cur_iter_ct = 0

    def snapshot():
        # Copy the current database to a file tagged with the progress so far.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters work, and a failed copy raises.
        subprocess.check_call(["cp", bdb_file, save_file_name])
        report(save_file_name, meta_file_name)

    def record_metadata(f,
                        saved_file_name,
                        sha_sum,
                        total_time,
                        plot_file_name=None):
        # Write the provenance lines for saved_file_name to the stream f.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n" %
                (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n" %
                (bayeslite.__version__, crosscat.__version__,
                 bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name,
               metadata_file,
               echo=False,
               plot_file_name=None):
        # Hash the saved database and write its metadata file, followed by
        # a copy of this script; optionally echo the metadata to stdout.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            # A bytes sentinel matches what a binary-mode read returns at
            # EOF on both Python 2 and 3; '' would never match on Python 3.
            for chunk in iter(lambda: fd.read(65536), b''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                            plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Append this script directly instead of shelling out to
            # `cat ... >>`, which broke on paths needing shell quoting.
            with open(__file__, 'r') as script:
                fd.write(script.read())

        if echo:
            record_metadata(sys.stdout, saved_file_name, sha_sum, total_time,
                            plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
            bdb, 'logscore', 'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file,
               final_metadata_file,
               echo=True,
               plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' %
                checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    # Same effect as `cd out_dir && ln -s ...` without building a shell
    # string.  Still best-effort: a failing ln is not treated as fatal.
    subprocess.call(
        ["ln", "-s", "satellites%s.bdb" % filestamp, "satellites.bdb"],
        cwd=out_dir)
Пример #12
0
 def execute(bql):
     """Run the BQL statement `bql` on the open database, logging it first."""
     log_line = "executing %s" % bql
     log(log_line)
     bdb.execute(bql)
Пример #13
0
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build, analyze and snapshot the satellites database.

    Loads ``satellites.csv`` (located next to this script) into a new
    on-disk bdb under `out_dir`, creates the ``satellites_cc`` crosscat
    generator, initializes `num_models` models and analyzes them in
    steps of `checkpoint_freq` iterations until at least `num_iters`
    iterations have run.  A copy of the database plus a metadata file
    is written before analysis starts and after every step.  At the
    end a diagnostics plot and a final metadata file are written, and
    ``satellites.bdb`` in `out_dir` is symlinked to the result.

    `seed` is passed to the crosscat multiprocessing engine.

    Exits with status 1 if the target bdb file already exists.
    Raises subprocess.CalledProcessError if a snapshot copy fails.
    """
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        # All outputs share the date/user stamp so runs are distinguishable.
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Same bytes on stdout as the former print statement, written in
        # a form that is valid syntax on both Python 2 and 3.
        sys.stdout.write(
            'Error: File %s already exists. Please remove it.\n' % bdb_file)
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
            header=True, create=True, ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator using
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

    cur_iter_ct = 0

    def snapshot():
        # Copy the current database to a file tagged with the progress so far.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters work, and a failed copy raises.
        subprocess.check_call(["cp", bdb_file, save_file_name])
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write the provenance lines for saved_file_name to the stream f.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__,
                   bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False, plot_file_name=None):
        # Hash the saved database and write its metadata file, followed by
        # a copy of this script; optionally echo the metadata to stdout.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            # A bytes sentinel matches what a binary-mode read returns at
            # EOF on both Python 2 and 3; '' would never match on Python 3.
            for chunk in iter(lambda: fd.read(65536), b''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name,
                            sha_sum, total_time, plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Append this script directly instead of shelling out to
            # `cat ... >>`, which broke on paths needing shell quoting.
            with open(__file__, 'r') as script:
                fd.write(script.read())

        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                          'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file,
               echo=True, plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    # Same effect as `cd out_dir && ln -s ...` without building a shell
    # string.  Still best-effort: a failing ln is not treated as fatal.
    subprocess.call(
        ["ln", "-s", "satellites%s.bdb" % filestamp, "satellites.bdb"],
        cwd=out_dir)