def main():
    """Parse the command line and launch training of the selected model.

    When neither ``--model-dir`` nor ``--config`` is given, both fall back to
    local-testing paths derived from the model type.
    """
    # turn on INFO-level TensorFlow logging
    tf.logging.set_verbosity(tf.logging.INFO)
    # keep records from also reaching ancestor loggers (fixes double messages)
    tf_logging._get_logger().propagate = False

    cli = argparse.ArgumentParser()
    cli.add_argument('-t', '--model-type', type=str, default='multiclass',
                     help='Model type')
    cli.add_argument('-d', '--model-dir', type=str,
                     help='Model\'s directory')
    cli.add_argument('-c', '--config', type=str, help='Path to the config')
    options = cli.parse_args()

    model_type = options.model_type
    model_dir, config_path = options.model_dir, options.config

    # default values for local testing, used only when both are missing
    if not (model_dir or config_path):
        model_dir = 'training/%s/local/test1' % model_type
        config_path = 'configs/%s/default.yaml' % model_type

    # train and evaluate the model
    train_model(model_type, model_dir, config_path)
Пример #2
0
def set_logger(file_name=None):
    """
    Reset the TensorFlow logger so it writes to stdout and, optionally, a file.

    Any handlers previously attached to the logger are dropped. When
    ``file_name`` is given a file handler is attached as well; that handler
    needs to be closed by `close_logger()` after use.

    :param file_name: path of the log file; falsy to log to the console only
    :return: None
    """
    # pylint: disable=no-name-in-module
    from tensorflow.python.platform.tf_logging import _get_logger

    def _with_format(handler, fmt):
        # attach a formatter built from ``fmt`` and hand the handler back
        handler.setFormatter(log.Formatter(fmt))
        return handler

    tf_logger = _get_logger()
    tf.logging.set_verbosity(tf.logging.INFO)

    # start from a clean slate, then add the console output
    tf_logger.handlers = []
    tf_logger.addHandler(
        _with_format(log.StreamHandler(sys.stdout), CONSOLE_LOG_FORMAT))

    if not file_name:
        return

    # adding file output
    tf_logger.addHandler(
        _with_format(log.FileHandler(file_name), FILE_LOG_FORMAT))
def main():
    """Run the exported model on the test images and print the predicted spikes."""
    # enable TensorFlow logging without duplicated messages
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_logging._get_logger().propagate = False

    export_dir = root_dir('export/final_model')  # directory with the exported model
    side = 48  # image size that the model accepts

    # load the dataset images, then derive the test images cropped to ``side``
    _, raw_images = load_data()
    test_images = get_test_dataset(raw_images, side)

    # load the model and get its predictions
    predictor = tf.contrib.predictor.from_saved_model(export_dir)
    outputs = predictor({'image': test_images})

    # print predicted spikes
    pprint(outputs['spikes'])
def main():
    """Parse the experiment options and start hyperparameter tuning."""
    # enable TensorFlow logging without duplicated messages
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_logging._get_logger().propagate = False

    # (flags, default, help) for every string option, in declaration order
    option_specs = (
        (('-t', '--model-type'), 'multiclass', 'Model type'),
        (('-g', '--experiment-group'), 'local', 'Experiment group'),
        (('-n', '--experiment-name'), 'test1', 'Experiment name'),
    )

    cli = argparse.ArgumentParser()
    for flags, default, help_text in option_specs:
        cli.add_argument(*flags, type=str, default=default, help=help_text)
    opts = cli.parse_args()

    tune_hyperparameters(opts.model_type,
                         opts.experiment_group,
                         opts.experiment_name)
Пример #5
0
def main(_):
    """Build the evaluation graph and evaluate once or in a loop.

    The mode is taken from ``FLAGS.eval_type`` (``'once'`` or ``'loop'``);
    any other value leaves the graph built but runs no evaluation.

    The single positional argument is ignored.
    """
    import logging
    import sys
    from tensorflow.python.platform import tf_logging

    record_format = ('%(levelname)s '
                     '%(asctime)s.%(msecs)06d: '
                     '%(filename)s: '
                     '%(lineno)d '
                     '%(message)s')
    logging.basicConfig(level=logging.DEBUG,
                        stream=sys.stderr,
                        format=record_format,
                        datefmt='%Y-%m-%d %H:%M:%S')

    # keep TensorFlow records from also reaching the root logger configured above
    tf_logging._get_logger().propagate = False

    print_flags()

    if not tf.gfile.Exists(FLAGS.eval_log_dir):
        tf.gfile.MakeDirs(FLAGS.eval_log_dir)

    dataset = datasets.create_dataset()
    model = models.create_model(num_char_classes=dataset.num_char_classes,
                                max_seq_len=dataset.max_seq_len,
                                null_code=dataset.null_code)

    batch = data_loader.get_data(dataset)
    endpoints = model.create_base(batch.images, is_training=False)
    # only the evaluation ops are needed here; the other two results are unused
    eval_ops, _, _ = model.create_eval_ops(batch, endpoints)

    tf.train.get_or_create_global_step()

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    eval_type = FLAGS.eval_type
    if eval_type not in ('once', 'loop'):
        # unknown evaluation mode: nothing to run
        return

    if eval_type == 'once':
        slim.evaluation.evaluate_once(master=FLAGS.master,
                                      checkpoint_path=FLAGS.ckpt_path,
                                      logdir=FLAGS.eval_log_dir,
                                      num_evals=FLAGS.num_batches,
                                      eval_op=eval_ops,
                                      session_config=session_config)
        return

    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=FLAGS.train_log_dir,
        logdir=FLAGS.eval_log_dir,
        eval_op=eval_ops,
        num_evals=FLAGS.num_batches,
        eval_interval_secs=FLAGS.eval_interval_secs,
        max_number_of_evaluations=FLAGS.number_of_steps,
        timeout=2000,
        session_config=session_config)
Пример #6
0
    def wrap_tf_logger(self, task_name):
        """Send TensorFlow log records to a per-task file and to the console.

        The TensorFlow logger's handlers are replaced by a file handler
        (path from ``self.loc(task_name)``) and a stream handler sharing one
        format, a ``TFFilter`` is added, and propagation is turned off.
        """
        shared_format = logging.Formatter('%(asctime)s - %(message)s')

        # file handler first, console handler second
        new_handlers = [
            logging.FileHandler(self.loc(task_name)),
            logging.StreamHandler(),
        ]
        for handler in new_handlers:
            handler.setFormatter(shared_format)

        tf_logger = tf_logging._get_logger()
        tf_logger.handlers = new_handlers
        tf_logger.addFilter(TFFilter())
        tf_logger.propagate = False
def main():
    """Parse the command line, then generate a submission and copy the model."""
    # enable TensorFlow logging without duplicated messages
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_logging._get_logger().propagate = False

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--model-type',
        type=str,
        default='multiclass',
        help='Model type',
    )
    cli.add_argument(
        '-d', '--model-dir',
        type=str,
        required=True,
        help='Model\'s directory',
    )
    cli.add_argument(
        '-n', '--submission-num',
        type=int,
        required=True,
        help='Submission number',
    )
    opts = cli.parse_args()

    # generate submission and copy the model
    generate_submission(opts.model_type, opts.model_dir, opts.submission_num)
Пример #8
0
def main(_):
    """Configure logging and devices, build the training graph and train.

    The single positional argument is ignored.
    """
    print_flags()
    import logging
    import sys
    from tensorflow.python.platform import tf_logging

    record_format = ('%(levelname)s '
                     '%(asctime)s.%(msecs)06d: '
                     '%(filename)s: '
                     '%(lineno)d '
                     '%(message)s')
    logging.basicConfig(level=logging.DEBUG,
                        stream=sys.stderr,
                        format=record_format,
                        datefmt='%Y-%m-%d %H:%M:%S')
    # keep TensorFlow records from also reaching the root logger configured above
    tf_logging._get_logger().propagate = False

    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpus

    prepare_training_dir()
    dataset = datasets.create_dataset()

    # num_char_classes represents `num_labels + 1` classes
    model = models.create_model(num_char_classes=dataset.num_char_classes,
                                max_seq_len=dataset.max_seq_len,
                                null_code=dataset.null_code)
    hparams = get_training_hparams()

    # If ps_tasks is zero, the local device is used. When using multiple
    # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
    # across the different devices.
    placement = tf.train.replica_device_setter(FLAGS.ps_tasks,
                                               merge_devices=True)
    with tf.device(placement):
        batch = data_loader.get_data(dataset)
        endpoints = model.create_base(batch.images, is_training=True)

        total_loss = model.create_loss(batch, endpoints)
        init_fn = model.create_init_fn(FLAGS.checkpoint)

        if FLAGS.show_graph_stats:
            logging.info('Total number of weights in the graph: %s',
                         profile_graph())
        train(total_loss, init_fn, hparams)
Пример #9
0
def initLogging(verbosity=0):
    """Set up logging with a given verbosity level.

    ``0`` selects WARN, ``1`` selects INFO and anything greater than ``1``
    selects DEBUG on the root logger; other values leave its level untouched.
    """
    from tensorflow.python.platform import tf_logging

    # TensorFlow installs its own handler; dropping it keeps every record
    # from being emitted twice.
    tf_logging._get_logger().handlers = []

    logging.basicConfig()

    if verbosity == 0:
        root_level = logging.WARN
    elif verbosity == 1:
        root_level = logging.INFO
    elif verbosity > 1:
        root_level = logging.DEBUG
    else:
        # no matching verbosity: keep the current root level
        return
    logging.root.setLevel(root_level)
Пример #10
0
def close_logger():
    """
    Flush, close and detach every handler of the TensorFlow logger.

    Handlers are processed last-to-first; a handler whose flush/close raises
    ``OSError`` or ``ValueError`` is skipped and stays attached.

    :return: None
    """
    # pylint: disable=no-name-in-module
    from tensorflow.python.platform.tf_logging import _get_logger

    tf_logger = _get_logger()
    # iterate over a reversed snapshot so removals cannot disturb the walk
    for attached in tf_logger.handlers[::-1]:
        try:
            attached.flush()
            attached.close()
            tf_logger.removeHandler(attached)
        except (OSError, ValueError):
            continue
Пример #11
0
def setup_tensorflow(device: Union[str, int, Sequence[int], Sequence[str]],
                     allow_growth: bool):
    """Setup tensorflow session according to gpu configuration.

    Sets ``CUDA_DEVICE_ORDER``, ``CUDA_VISIBLE_DEVICES`` and
    ``TF_CPP_MIN_LOG_LEVEL`` in the process environment, lowers TensorFlow's
    Python logging to WARN, disables propagation of the TensorFlow logger and
    enters a new default session unless one already exists.

    Args:
        device (Union[str, int, Sequence[int], Sequence[str]]): GPU or list /
            tuple of GPUs to run on
        allow_growth (bool): Whether to capture all memory on gpu or grow as necessary

    Returns:
        sess (tf.Session): Tensorflow Session object as the default session

    Raises:
        ValueError: If ``device`` is not an int, str, list or tuple.
    """
    if isinstance(device, int):
        device = str(device)
    elif isinstance(device, (list, tuple)):
        # tuples are accepted too: the signature promises any Sequence
        device = ', '.join(str(d) for d in device)
    elif not isinstance(device, str):
        raise ValueError(
            "Unrecognized device type. Expected int, str, list, or tuple. "
            "Received {}.".format(type(device)))

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = device
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # disable tensorflow info logging
    tf.logging.set_verbosity(tf.logging.WARN)

    from tensorflow.python.platform import tf_logging
    try:
        # tensorflow == 1.13
        tf_logging.get_logger().propagate = False
    except AttributeError:
        # tensorflow <= 1.12
        tf_logging._get_logger().propagate = False

    gpu_options = tf.GPUOptions(allow_growth=allow_growth)
    conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
    sess = tf.get_default_session()
    if sess is None:
        sess = tf.Session(config=conf)
        # enter the session so that it becomes the default one
        sess.__enter__()  # type: ignore

    np.set_printoptions(suppress=True)

    return sess
Пример #12
0
def main():
    """Parse the command line and export the model stored in ``--model-dir``."""
    # enable TensorFlow logging without duplicated messages
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_logging._get_logger().propagate = False

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--model-type',
        type=str,
        default='multiclass',
        help='Model type',
    )
    cli.add_argument(
        '-d', '--model-dir',
        type=str,
        required=True,
        help='Model\'s directory',
    )
    cli.add_argument(
        '-s', '--checkpoint-step',
        type=int,
        help='Export the model from the checkpoint taken on '
             'the specified global step.',
    )
    opts = cli.parse_args()

    # load the model config and create the matching model builder
    model_config = load_model_config(opts.model_dir)
    builder = create_builder(opts.model_type, model_config)

    # export the model (optionally from a specific checkpoint step)
    export(builder, opts.model_dir, opts.checkpoint_step)
Пример #13
0
def get_logger(logger_name=None, log_file: Path = None, level=logging.DEBUG):
    """Return a logger, configuring it on first use.

    With no ``logger_name`` the TensorFlow logger is used, otherwise the
    named standard-library logger. If the logger has no handlers yet, its
    level is set to ``level`` and it receives an optional file handler (the
    file is opened in ``"w"`` mode and its parent directory is created) plus
    a stream handler, all using ``LOG_FORMAT``.
    """
    logger = (tf_logging._get_logger() if logger_name is None
              else logging.getLogger(logger_name))

    # NOTE(review): hasHandlers() also reports handlers on ancestor loggers,
    # so a configured root logger makes this return early - confirm intended.
    if logger.hasHandlers():
        return logger

    logger.setLevel(level)
    shared_format = logging.Formatter(LOG_FORMAT)

    # file output (if requested) is attached before the console output
    pending = []
    if log_file is not None:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        pending.append(logging.FileHandler(log_file, mode="w"))
    pending.append(logging.StreamHandler())

    for handler in pending:
        handler.setFormatter(shared_format)
        logger.addHandler(handler)

    return logger
Пример #14
0
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import, division, print_function

import argparse
import json
import logging as _logging
import numpy as np
import os
import sys as _sys
import tensorflow as tf
from tensorflow.python.platform import tf_logging

# Log at DEBUG level and make a single stdout handler the TensorFlow
# logger's only handler.
tf.logging.set_verbosity(tf.logging.DEBUG)
tf_logger = tf_logging._get_logger()
_handler = _logging.StreamHandler(_sys.stdout)
tf_logger.handlers = [_handler]


def cnn_model_fn(features, labels, mode):
    """Model function for CNN.

    The visible part reshapes ``features["x"]`` into a 4-D tensor of shape
    ``[-1, 28, 28, 1]``; ``labels`` and ``mode`` are not used by the code
    shown here.
    """
    # NOTE(review): the body appears to be truncated after the input layer;
    # the remaining layers are not visible in this view.
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
Пример #15
0
def main():
    """Train and evaluate the MNIST estimator on a Petastorm dataset.

    Parses the command line (several defaults come from SageMaker ``SM_*``
    environment variables), configures TensorFlow logging, builds a
    ``tf.estimator.Estimator`` and runs ``tf.estimator.train_and_evaluate``
    with one Petastorm reader for the ``train`` split (sharded by worker
    index) and one for the ``test`` split.

    Raises:
        KeyError: If ``SM_HOSTS`` or ``SM_CURRENT_HOST`` is missing from the
            environment when the worker index is computed.
    """
    parser = argparse.ArgumentParser(
        description='Petastorm/Sagemaker/Tensorflow MNIST Example')

    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir',
                        type=str,
                        default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train',
                        type=str,
                        default=os.environ.get('SM_CHANNEL_TRAINING'))
    # Fall back to an empty JSON list so that building the parser does not
    # raise TypeError (json.loads(None)) when SM_HOSTS is not set.
    # NOTE(review): type=list turns a command-line value into a list of
    # characters; args.hosts is not read in this function - confirm intent.
    parser.add_argument('--hosts',
                        type=list,
                        default=json.loads(os.environ.get('SM_HOSTS', '[]')))
    parser.add_argument('--current-host',
                        type=str,
                        default=os.environ.get('SM_CURRENT_HOST'))

    parser.add_argument('--dataset-url',
                        type=str,
                        metavar='S',
                        help='S3:// URL to the MNIST petastorm dataset')

    parser.add_argument('--training_steps', type=int, default=300)
    parser.add_argument('--evaluation_steps', type=int, default=10)
    parser.add_argument('--log_step_count_steps', type=int, default=100)
    parser.add_argument('--save_checkpoints_steps', type=int, default=500)
    parser.add_argument('--save_summary_steps', type=int, default=50)
    parser.add_argument('--throttle_secs', type=int, default=10)

    parser.add_argument('--prefetch_size', type=int, default=16)
    parser.add_argument('--num_parallel_batches', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=256)

    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.DEBUG)

    # TF 1.13 and 1.14 handle logging a bit different: the private accessor
    # or the default handler may be missing. Re-formatting is best effort, so
    # only those two expected failures are tolerated.
    try:
        tf_logger = tf_logging._get_logger()
        handler = tf_logger.handlers[0]
        handler.setFormatter(
            _logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    except (AttributeError, IndexError):
        pass

    # In 1.14, a multi-worker synchronous training can be achieved using CollectiveAllReduceStrategy per
    # See https://github.com/tensorflow/tensorflow/issues/23664
    # Without providing train_distribute, I believe asynchronous training is done
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=args.save_checkpoints_steps,
        log_step_count_steps=args.log_step_count_steps,
        save_summary_steps=args.save_summary_steps,
    )

    # NOTE(review): assumes model_dir ends with a 5-character component and
    # has at least two '/'-separated parts before it - confirm with caller.
    model_dir_parent_path = args.model_dir[:-5]
    model_dir_parent = model_dir_parent_path.split("/")[-2]

    # Every fragment containing a placeholder must be an f-string; otherwise
    # the braces are printed literally.
    print(
        "Launch tensorboard by running the following in terminal:\n"
        f"aws s3 sync {model_dir_parent_path} ~/Downloads/{model_dir_parent} && "
        f"tensorboard --logdir=~/Downloads/{model_dir_parent}")

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=args.model_dir,
                                       params={"batch_size": args.batch_size},
                                       config=run_config)

    workers = json.loads(os.environ['SM_HOSTS'])
    worker_index = workers.index(os.environ['SM_CURRENT_HOST'])
    nr_workers = len(workers)
    print(
        f"Inside training script on worker with (0-based) index {worker_index} out of {nr_workers - 1}."
    )

    # NOTE(review): workers_count is set to the number of hosts; presumably
    # it sizes the reader's worker pool - confirm this is intended.
    with make_reader(os.path.join(args.dataset_url, 'train'),
                     num_epochs=None,
                     cur_shard=worker_index,
                     shard_count=nr_workers,
                     workers_count=nr_workers) as train_reader:
        with make_reader(os.path.join(args.dataset_url, 'test'),
                         num_epochs=None,
                         cur_shard=0,
                         shard_count=1) as eval_reader:

            def train_fn():
                # input function for the training split
                return _input_fn(
                    reader=train_reader,
                    batch_size=args.batch_size,
                    num_parallel_batches=args.num_parallel_batches)

            def eval_fn():
                # input function for the evaluation split
                return _input_fn(
                    reader=eval_reader,
                    batch_size=args.batch_size,
                    num_parallel_batches=args.num_parallel_batches)

            train_spec = tf.estimator.TrainSpec(input_fn=train_fn,
                                                max_steps=args.training_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_fn,
                                              throttle_secs=args.throttle_secs,
                                              steps=args.evaluation_steps)

            tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Пример #16
0
def _main():
    # commandline argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Random Forest Classifier based on Tensorflow (TM)',
        epilog=textwrap.dedent('''\
            Notes:
              You *MUST* specify at least one of the -r, -t or -x
              options

            Examples:

              - Simple training and testing:
                %(prog)s -t cat1.fits -x cat2.fitsi -c "Class"

              - Train the classifier and save the trained model:
                %(prog)s --train traincat.fits --model-dir ./mymodel/

              - Load saved model and run it on a catalogl:
                %(prog)s --model-dir ./mymodel --run mycat.fits

              - Train the classifier with a timeout of 1 day and 6 hours
                %(prog)s --train traincat.votable --train_timeout 1d3h
                %(prog)s --train traincat.votable --train_timeout 1.25d
                %(prog)s --train traincat.votable --train_timeout 30h

              - Train the classifier with a loss treshold of 0.001
                %(prog)s --train train.csv --loss-treshold 0.001

            For more info import the classtf module in python and run
            >>> help(classtf)


            '''))

    parser.add_argument('-r', '--run', type=str, action='store',
                        metavar='FILE', dest='run_filename',
                        help="If specified, the classifier is run using\
                              the dataset %(metavar)s as input")

    parser.add_argument('-t', '--train', type=str, action='store',
                        metavar='FILE', dest='train_filename',
                        help="If specified, the classifier is trained using\
                              the dataset %(metavar)s")

    parser.add_argument('-x', '--test', type=str, action='store',
                        metavar='FILE', dest='test_filename',
                        help="If specified, the classifier is tested using\
                              the dataset %(metavar)s")

    parser.add_argument('-f', '--feature-importance', action='store_true',
                        dest='do_feature_importance',
                        help="If specified, the importance of each feature\
                              is computed. Can be used only if both --train\
                              and --test options are specified")

    parser.add_argument('--loss-treshold', type=float,  action='store',
                        metavar='VALUE', dest='loss_treshold', default=0.001,
                        help="If specified, the training will stop when the\
                              loss changes between two cycles becomes smaller\
                              than %(metavar)s (or when the training timeout\
                              expires). This option has effect only when the\
                              option --train is specified and is ignored\
                              otherwise. If not specified, the default value\
                              of 0.001 is used.")

    parser.add_argument('--train-timeout', type=str, action='store',
                        metavar='TIME_INTERVAL', dest="train_timeout",
                        help="If specified set the maximum execution time\
                              for the training process. %(metavar)s must be\
                              a string representing a time interval. Allowed\
                              units are y (years), d (days), h (hours),\
                              m (minutes), s (seconds) [i.e. 1y2d13h20m13.3s\
                              If not specified, no timeout is applied")

    parser.add_argument('-c', '--target', type=str, action='store',
                        metavar='TARGET_FEATURE_ID', dest='class_target',
                        default=-1,
                        help="Set the name or column index of feature used as\
                              target class during training and testing. If\
                              not specified, the last column in the dataset\
                              is used as default")

    parser.add_argument('--ignore-features', type=str, action='store',
                        metavar='', dest='skip_list', nargs='+',
                        help="List of features that should be ignored")

    parser.add_argument('-d', '--model-dir', type=str, action='store',
                        metavar='MODEL_DIR', dest='model_dir',
                        help="If specified, the trained model is saved or\
                              restored from %(metavar)s")

    parser.add_argument('-n', '--trees', type=int, action='store',
                        metavar='NUM_OF_TREES', dest='num_trees',
                        default=1000,
                        help="If specified, set the number of generated trees\
                              to %(metavar)s. Otherwisee fallback to the\
                              default value of 1000 trees")

    parser.add_argument('-b', '--batch-size', type=int, action='store',
                        metavar='BATCH_SIZE', dest='batch_size',
                        default=4096,
                        help="If specified, set  the size of the batch to\
                              to %(metavar)s, which is the number of object\
                              used at once during a training/test/run cycle.\
                              The default value is 4096")

    parser.add_argument('--depth', type=int, action='store',
                        metavar='NUM_NODES', dest='max_nodes',
                        default=10000,
                        help="If specified, set  the maximum number of nodes\
                              created by the model to %(metavar)s.\
                              The default value is 10000")

    parser.add_argument('-v', '--version', action='store_true',
                        dest='show_version',
                        help="Print the program version and exit")

    args = parser.parse_args()
    print("")

    if args.show_version:
        print("classtf - random forset classifier")
        print("version {0:d}.{1:d}.{2:d}".format(VERSION_MAJ,
                                                 VERSION_MIN,
                                                 VERSION_REV))
        print("")
        sys.exit(0)

    do_train = _check_file(args.train_filename)
    do_test = _check_file(args.test_filename)
    do_run = _check_file(args.run_filename)

    if args.do_feature_importance and (not do_test or not do_train):
        print(MSG_ERR_FIMPO)
        sys.exit(1)

    if not (do_run or do_test or do_train):
        parser.print_help()
        sys.exit(1)

    # Preliminary sanity checks
    if not args.model_dir:
        # if no model directory is specified hten create just
        # a temporary directoory
        tmp_dir = tempfile.TemporaryDirectory(prefix="rf-model-")
        model_dir = tmp_dir.name
    else:
        model_dir = args.model_dir
        # If we have specified the model directory but we are not
        # performing a training then we want to read the save we saved
        # early. In this case let's check if it really exists
        if not os.path.isdir(model_dir):
            if not args.train_filename:
                # no training...  bailing out
                print(MSG_ERR_DIR_NOT_FOUND.format(model_dir))
                sys.exit(1)
            elif not os.path.exists(model_dir):
                # the directory does not exist, let's try to create it
                try:
                    os.makedirs(model_dir)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        # If somehow the direcotry has been created after the
                        # elif invocation, then just ignore the error, it
                        # exists that's what matters
                        print(MSG_ERR_DIR_PERM.format(model_dir))
                        sys.exit(1)
            else:
                print(MSG_ERR_DIR_NOT_FOUND.format(model_dir))
                sys.exit(1)
                # The path exists but is not a directory... bailing ou

    # NOTE: there is a bug in tensorflow version 1.12.0 that floods the
    #       console with warnings. They do not affect the execution, but are
    #       quite annoying. Let's hijack them to a log file...
    logfile = os.path.join(model_dir, 'log.txt')
    tf_logger = tf_logging._get_logger()
    tf_logger.removeHandler(tf_logger.handlers[0])
    tf_logger.addHandler(logging.FileHandler(logfile))

    train_timeout = timestr2sec(args.train_timeout)

    # Loading actual data
    if args.run_filename:
        x_data, _ = readinput(args.run_filename, None, args.skip_list)

    #
    # Here the actual program starts
    # Let's create a default tensorflow session
    #
    with tf.Session() as sess:
        ap.conf.max_lines = -1
        ap.conf.max_width = -1

        rf = RFModel(args.num_trees,
                     args.max_nodes,
                     tf_session=sess,
                     loss_treshold=args.loss_treshold,
                     batch_size=args.batch_size,
                     train_timeout=train_timeout)

        print("")
        print(MSG_INFO_MODELDIR.format(model_dir))
        print("")
        if do_train:
            print("Reading train dataset...")
            x_train, y_train, x_names, y_names = readinput(
                args.train_filename,
                args.class_target,
                args.skip_list,
                make_shadows=args.do_feature_importance)

            t_start = time.time()
            c_t_start = time.ctime()
            rf.train(x_train, y_train, model_dir)
            c_t_end = time.ctime()
            delta_t = datetime.timedelta(seconds=time.time()-t_start)

            with open(args.train_filename+'-train_log.txt', 'w') as outf:
                outf.write("Training starts at {}\n".format(c_t_start))
                outf.write(LOG_MODEL_INFO.format(rf))
                outf.write("\nFeatures: {}\n".format(x_names))
                outf.write("\nTarget class: {}\n".format(y_names))
                outf.write("Training ends at {}\n".format(c_t_end))
                outf.write("Elapsed time: {}\n".format(delta_t))

            del x_train
            del y_train
            print("")

        if do_test:
            print("Reading test dataset...")
            x_test, y_test, x_names, y_names = readinput(
                args.test_filename,
                args.class_target,
                args.skip_list,
                make_shadows=args.do_feature_importance)

            t_start = time.time()
            c_t_start = time.ctime()
            pred, cm = rf.test(x_test, y_test, model_dir)
            c_t_end = time.ctime()
            delta_t = datetime.timedelta(seconds=time.time()-t_start)

            t = Table.read(args.test_filename)
            newcol_name = 'PRED_CLASS'
            while newcol_name in t.colnames:
                newcol_name = '_'+newcol_name

            # add the predicted classes as last column in the dataset
            newcol = Column(pred, name=newcol_name)
            t.add_column(newcol)

            # save the test output
            catname = os.path.splitext(args.test_filename)[0] + '-test_output.fits'
            t.write(catname, format='fits')
            del t

            print("\nConfusion matrix:")
            print(cm)

            base_scores = score(cm)
            base_f1 = np.mean(base_scores, axis=0)[-1]

            # write some info in the test log
            with open(args.test_filename+'-test_log.txt', 'w') as outf:
                outf.write("Test starts at {}\n".format(c_t_start))
                outf.write(LOG_MODEL_INFO.format(rf))
                outf.write("\nFeatures: {}\n".format(x_names))
                outf.write("\nTarget class: {}\n".format(y_names))
                outf.write('CONFUSION MATRIX\n')
                outf.write(str(cm).replace('[', ' ').replace(']', ''))
                print("\n         precision   recall   f1-score")
                outf.write("\n\n         precision   recall   f1-score\n")
                fmt_str = "class {0:d}:  {1: 8.3f} {2: 8.3f}   {3: 8.3f}"
                for stats in base_scores:
                    print(fmt_str.format(*stats))
                    outf.write(fmt_str.format(*stats)+'\n')
                print("")
                outf.write("Test ends at {}\n".format(c_t_end))
                outf.write("Elapsed time: {}\n".format(delta_t))

            if args.do_feature_importance:
                num_features = x_test.shape[-1]

                t_start = time.time()
                c_t_start = time.ctime()

                imp_table = Table(
                    names=['ID', 'FEATURE', 'MDA', 'Z-SCORE', 'IMPORTANCE'],
                    dtype=['uint8', 'S32', 'float32', 'float32', 'float32'])
                imp_table['ID'].format = 'd'
                imp_table['FEATURE'].format = '>s'
                imp_table['MDA'].format = ' 10.6f'
                imp_table['Z-SCORE'].format = ' 10.6f'
                imp_table['IMPORTANCE'].format = ' 10.6f'

                # Using the Boruta algorithm
                print("Feature importance analysis...")
                for i in range(num_features):
                    print("Feature {} of {}: ".format(i+1, num_features))
                    # backup the column data
                    orig_col = x_test[..., i].copy()
                    # shuffle the i-th column
                    np.random.shuffle(x_test[..., i])

                    # compute the new confusion_matrix
                    _, ith_cm = rf.test(x_test, y_test, model_dir)

                    # restore the column
                    x_test[..., i] = orig_col.copy()
                    del orig_col

                    # compute the average score
                    ith_scores = score(ith_cm)
                    m_f1 = np.mean(ith_scores, axis=0)[-1]
                    imp_table.add_row([i, x_names[i], base_f1 - m_f1, 0, 0])
                c_t_end = time.ctime()
                delta_t = datetime.timedelta(seconds=time.time()-t_start)

                # computing the Z-score
                imp_table['Z-SCORE'] = imp_table['MDA']
                imp_table['Z-SCORE'] -= imp_table['MDA'].mean()
                imp_table['Z-SCORE'] /= imp_table['MDA'].std()

                # Finding the Maximum Z Shadow Accuracy
                MSZA = max(
                    x['Z-SCORE'] for x in imp_table
                    if x['FEATURE'].startswith('__shadow_')
                )

                imp_table['IMPORTANCE'] = imp_table['Z-SCORE']/MSZA

                print("\nAnalysis results:")
                imp_table.sort('IMPORTANCE')
                imp_table.reverse()
                imp_table.pprint(align=['>', '>', '>', '>', '>'])

                with open(args.test_filename+'-fimportance.txt', 'w') as outf:
                    outf.write("FI starts at {}\n\n".format(c_t_start))
                    outf.write(str(imp_table))
                    outf.write("\n\nFI ends at {}\n".format(c_t_end))
                    outf.write("Elapsed time: {}\n".format(delta_t))
            del x_test
            del y_test
            print("")

        if do_run:
            print("Reading input dataset...")
            x_data, _, x_names, _ = readinput(
                args.run_filename,
                None,
                args.skip_list)
            pred = rf.run(x_data, model_dir)
            del x_data

            # Read the original file
            t = Table.read(args.run_filename)

            # Check if the name is already used and if this is the case
            # then use a different name
            newcol_name = 'PRED_CLASS'
            while newcol_name in t.colnames:
                newcol_name = '_'+newcol_name

            # add the predicted classes as last column in the dataset
            newcol = Column(pred, name=newcol_name)
            t.add_column(newcol)

            catname = os.path.splitext(args.run_filename)[0] + '-rfout.fits'
            t.write(catname, format='fits')
Пример #17
0
#!/mnt/data/anaconda3-cpu-mkl/bin/python

import pandas as pd
import tensorflow as tf
import numpy as np
import math

import time
from tensorflow.python.platform import tf_logging as logging

# Show INFO-level records from the TensorFlow logger.
logging._get_logger().setLevel(logging.INFO)
# Reference timestamp taken at script start.
# NOTE(review): time.clock() was removed in Python 3.8; the code that
# consumes ``start`` is not visible here, so switch both sides together.
start = time.clock()

# ### parameters to adjust:

# In[ ]:

# NOTE(review): the meaning of each value below is inferred from its name
# only; their consumers are outside this view.
hidden_units = [128, 64, 32]
learning_rate = 0.001
batch_size = 2000
num_epochs = 50
l1_regularization_strength = 0.001
hash_bucket_size = 200

# Alternative local input, kept for reference:
#filenames = ["./ext_1.csv"]
filenames = ["hdfs://192.168.1.2:4545/census_extended/ext_1.csv"]
training_data_pandas = "sample.csv"  # to fetch feature names/dtypes and calculate mean & std for categorical columns.
target = 'income'
delim = ','
label_vocabulary = ["<=50K", ">50K"]
Пример #18
0
# Command-line flag definitions; every flag here is declared with an empty
# help string, so the notes below rely on the flag names and defaults only.
# NOTE(review): earlier flag definitions may precede this view.
flags.DEFINE_boolean('eval_best_model', False, '')

# Float flag, default 0.05.
flags.DEFINE_float('min_visl_detection_score', 0.05, '')

flags.DEFINE_boolean('run_once', False, '')
flags.DEFINE_boolean('eval_coco_on_voc', False, '')

# String flags, empty by default.
flags.DEFINE_string('shard_indicator', '', '')

flags.DEFINE_string('input_pattern', '', '')

FLAGS = flags.FLAGS

# Stop the logger returned by ``logging._get_logger()`` from propagating its
# records to ancestor loggers. The AttributeError guard tolerates a
# ``logging`` module that does not provide ``_get_logger``.
try:
    logging._get_logger().propagate = False
except AttributeError:
    pass

STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige',
    'Bisque', 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue',
    'AntiqueWhite', 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk',
    'Crimson', 'Cyan', 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki',
    'DarkOrange', 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise',
    'DarkViolet', 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick',
    'FloralWhite', 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold',
    'GoldenRod', 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory',
    'Khaki', 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon',
    'LightBlue', 'LightCoral', 'LightCyan', 'LightGoldenRodYellow',
    'LightGray', 'LightGrey', 'LightGreen', 'LightPink', 'LightSalmon',