Example #1
def main(argv):
    del argv

    if not gfile.Exists(FLAGS.save_dir):
        try:
            gfile.MkDir(FLAGS.save_dir)
        except:
            print(('WARNING: error creating save directory, '
                   'directory most likely already created.'))

    save_dir = os.path.join(FLAGS.save_dir,
                            FLAGS.dataset + '_' + FLAGS.sampling_method)

    if FLAGS.do_save == "True":
        if not gfile.Exists(save_dir):
            try:
                gfile.MkDir(save_dir)
            except:
                print(('WARNING: error creating save directory, '
                       'directory most likely already created.'))

        # Set up logging
        filename = os.path.join(
            save_dir,
            "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
        sys.stdout = utils.Logger(filename)

    X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)  # load dataset
    starting_seed = FLAGS.seed

    all_results = {}

    for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)  # load sampler
        score_model = utils.get_model(FLAGS.score_method,
                                      seed)  # load score model
        if (FLAGS.select_method == "None" or  # load select model
                FLAGS.select_method == FLAGS.score_method):
            select_model = None
        else:
            select_model = utils.get_model(FLAGS.select_method, seed)

        results, sampler_state = generate_one_curve(
            X=X,
            y=y,
            sampler=sampler,
            score_model=score_model,
            seed=seed,
            warmstart_size=FLAGS.warmstart_size,
            batch_size=FLAGS.batch_size,
            select_model=select_model,
            max_points=FLAGS.max_dataset_size)

        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, FLAGS.warmstart_size, FLAGS.batch_size,
               seed)

        # sampler_output = sampler_state.to_dict()
        # results['sampler_output'] = sampler_output
        results['sampler_output'] = None
        all_results[key] = results

    fields = [
        'dataset', 'sampling_methods', 'score_method', 'select_method',
        'warmstart size', 'batch size', 'seed'
    ]
    all_results['tuple_keys'] = fields

    if FLAGS.do_save == "True":
        filename = ("results_score_" + FLAGS.score_method + "_select_" +
                    FLAGS.select_method)
        existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
        filename = os.path.join(
            save_dir,
            filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
        pickle.dump(all_results, gfile.GFile(filename, "w"))
        sys.stdout.flush_file()
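
Example #1 redirects sys.stdout to utils.Logger(filename) and later calls sys.stdout.flush_file(), but the Logger class itself is not shown. Below is a minimal sketch of such a tee-style logger, assuming only the interface implied by those two calls; the actual utils.Logger in the repository may differ, e.g. by writing through gfile.

import sys


class Logger(object):
    """Tee-style logger: echoes every write to the console and to a log file."""

    def __init__(self, filename):
        self.terminal = sys.stdout      # keep the original stdout
        self.log = open(filename, "w")  # sketch only; the real code may use gfile.GFile

    def write(self, message):
        self.terminal.write(message)    # print to the console
        self.log.write(message)         # and record in the log file

    def flush(self):
        self.terminal.flush()

    def flush_file(self):
        self.log.flush()
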
Example #2

def main(args):

    # make the export folder structure
    # this is made here because the Logger uses the filename
    if args.do_save:
        # make a base save directory
        utils.make_dir(args.save_dir)

        # make a directory in the base save directory for the specific
        # method.
        save_subdir = os.path.join(args.save_dir,
                                   args.dataset + "_" + args.sampling_method)
        utils.make_dir(save_subdir)

        filename = os.path.join(
            save_subdir,
            "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
        sys.stdout = utils.Logger(filename)

    # confusion argument can have multiple values
    confusions = [float(t) for t in args.confusions.split(" ")]
    mixtures = [float(t) for t in args.active_sampling_percentage.split(" ")]
    max_dataset_size = None if args.max_dataset_size == 0 else args.max_dataset_size
    starting_seed = args.seed

    # get the dataset from file based on the data directory and dataset name
    X, y = utils.get_mldata(args.data_dir, args.dataset)

    # object to store the results in
    all_results = {}

    # percentage of labels to randomize
    for c in confusions:

        # Mixture weights on active sampling."
        for m in mixtures:

            # run one trial (one learning curve) per seed
            for seed in range(starting_seed, starting_seed + args.trials):

                # get the sampler (also called the query strategy) by name
                sampler = get_AL_sampler(args.sampling_method)

                # get the scoring model
                score_model = utils.get_model(args.score_method, seed)

                # use a separate select model only when it differs from the score model
                if (args.select_method == "None"
                        or args.select_method == args.score_method):
                    select_model = None
                else:
                    select_model = utils.get_model(args.select_method, seed)

                # create the learning curve
                results, sampler_state = generate_one_curve(
                    X,
                    y,
                    sampler,
                    score_model,
                    seed,
                    args.warmstart_size,
                    args.batch_size,
                    select_model,
                    confusion=c,
                    active_p=m,
                    max_points=max_dataset_size,
                    standardize_data=args.standardize_data,
                    norm_data=args.normalize_data,
                    train_horizon=args.train_horizon)
                key = (args.dataset, args.sampling_method, args.score_method,
                       args.select_method, m, args.warmstart_size,
                       args.batch_size, c, args.standardize_data,
                       args.normalize_data, seed)
                sampler_output = sampler_state.to_dict()
                results["sampler_output"] = sampler_output
                all_results[key] = results

    # Not sure why this is done this way.
    fields = [
        "dataset", "sampler", "score_method", "select_method",
        "active percentage", "warmstart size", "batch size", "confusion",
        "standardize", "normalize", "seed"
    ]
    all_results["tuple_keys"] = fields

    # write the results to a file
    if args.do_save:

        # format the filename
        filename = "results_score_{}_select_{}_norm_{}_stand_{}".format(
            args.score_method, args.select_method, args.normalize_data,
            args.standardize_data)

        existing_files = gfile.Glob(
            os.path.join(save_subdir, "{}*.pkl".format(filename)))
        filepath = os.path.join(
            save_subdir, "{}_{}.pkl".format(filename,
                                            str(1000 + len(existing_files))[1:]))

        # dump the dict to a pickle file
        pickle.dump(all_results, gfile.GFile(filepath, "w"))

        # flush stdout
        sys.stdout.flush_file()
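
Example #2 replaces the try/except gfile.MkDir pattern of the other two examples with utils.make_dir. A minimal sketch of such a helper, assuming plain os rather than gfile (the real utils.make_dir may differ):

import os


def make_dir(path):
    """Create the directory at `path` if it does not already exist."""
    os.makedirs(path, exist_ok=True)  # no-op when the directory is already there
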
Example #3
def main(argv):
  del argv

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(
      FLAGS.save_dir,
      FLAGS.dataset + "_" + FLAGS.sampling_method)
  do_save = FLAGS.do_save == "True"

  if do_save:
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  confusions = [float(t) for t in FLAGS.confusions.split(" ")]
  mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")]
  all_results = {}
  max_dataset_size = None if FLAGS.max_dataset_size == "0" else int(
      FLAGS.max_dataset_size)
  normalize_data = FLAGS.normalize_data == "True"
  standardize_data = FLAGS.standardize_data == "True"
  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)
  starting_seed = FLAGS.seed

  for c in confusions:
    for m in mixtures:
      for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)
        score_model = utils.get_model(FLAGS.score_method, seed)
        if (FLAGS.select_method == "None" or
            FLAGS.select_method == FLAGS.score_method):
          select_model = None
        else:
          select_model = utils.get_model(FLAGS.select_method, seed)
        results, sampler_state = generate_one_curve(
            X, y, sampler, score_model, seed, FLAGS.warmstart_size,
            FLAGS.batch_size, select_model, c, m, max_dataset_size,
            standardize_data, normalize_data, FLAGS.train_horizon)
        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, m, FLAGS.warmstart_size, FLAGS.batch_size,
               c, standardize_data, normalize_data, seed)
        sampler_output = sampler_state.to_dict()
        results["sampler_output"] = sampler_output
        all_results[key] = results
  fields = [
      "dataset", "sampler", "score_method", "select_method",
      "active percentage", "warmstart size", "batch size", "confusion",
      "standardize", "normalize", "seed"
  ]
  all_results["tuple_keys"] = fields

  if do_save:
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method +
                "_norm_" + str(normalize_data) +
                "_stand_" + str(standardize_data))
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(save_dir,
                            filename + "_" + str(1000+len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "w"))
    sys.stdout.flush_file()
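
In all three versions the per-run results are keyed by a flat tuple, and the matching field names are stored separately under "tuple_keys". A minimal sketch of how a consumer might read one of the saved pickles back and turn each key into a readable configuration dict; the path below is hypothetical, and real file names start with results_score_<score>_select_<select>:

import pickle

# hypothetical example path
path = "save_dir/mnist_margin/results_score_logistic_select_None_000.pkl"

with open(path, "rb") as f:
    all_results = pickle.load(f)

fields = all_results.pop("tuple_keys")   # e.g. ["dataset", "sampler", ..., "seed"]
for key, results in all_results.items():
    config = dict(zip(fields, key))      # pair each field name with its value in the key
    print(config["dataset"], config["seed"], sorted(results.keys()))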