Example #1
def map_func(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print ("cluster:" + str(cluster_json))
    print ("job name:" + job_name)
    print ("current index:" + str(index))
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
                                 device_filters=["/job:ps", "/job:worker/task:%d" % index])
    t = time.time()
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(tf.train.replica_device_setter(worker_device='/job:worker/task:' + str(index), cluster=cluster)):
            train_ops = build_graph()
            print("python worker index:" + str(index))
            sys.stdout.flush()
            try:
                hooks = [tf.train.StopAtStepHook(last_step=2)]
                with tf.train.MonitoredTrainingSession(master=server.target, config=sess_config,
                                                       checkpoint_dir="./target/tmp/s1/" + str(t),
                                                       hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print (mon_sess.run(train_ops, feed_dict={a: [1.0, 2.0, 3.0]}))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
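Example #1 calls a build_graph() helper and feeds a placeholder a, neither of which is part of the listing. A minimal sketch that is consistent with those call sites (purely an assumption, not the original helper) could look like this:

# Hypothetical sketch of the build_graph() helper assumed by Example #1.
# It must create the placeholder `a` fed in the training loop, create a
# global step (required by StopAtStepHook), and return an op to run.
import tensorflow as tf

def build_graph():
    global a
    a = tf.placeholder(tf.float32, shape=[3], name="a")  # fed with [1.0, 2.0, 3.0]
    v = tf.Variable(tf.zeros([3]), name="v")  # trivial trainable state
    global_step = tf.train.get_or_create_global_step()
    with tf.control_dependencies([tf.assign_add(global_step, 1)]):
        train_ops = tf.assign_add(v, a)  # yields the updated value of v
    return train_ops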
Example #2
def map_fun(context):
    tf.compat.v1.disable_v2_behavior()
    print(tf.__version__)
    sys.stdout.flush()
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    jobName = context.jobName
    index = context.index
    clusterStr = context.properties["cluster"]
    delim = context.properties["SYS:delim"]
    epochs = int(context.properties["epochs"])
    data_file = context.properties["data.file"]
    print(index, clusterStr)
    sys.stdout.flush()
    clusterJson = json.loads(clusterStr)
    cluster = tf.compat.v1.train.ClusterSpec(cluster=clusterJson)
    server = tf.compat.v1.train.Server(cluster,
                                       job_name=jobName,
                                       task_index=index)
    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])

    with tf.compat.v1.device(
            tf.compat.v1.train.replica_device_setter(
                worker_device='/job:worker/task:' + str(index),
                cluster=cluster)):
        filename_queue = tf.compat.v1.train.string_input_producer(
            [data_file], num_epochs=epochs)
        reader = tf.compat.v1.TextLineReader()
        key, value = reader.read(filename_queue)
        global_step = tf.compat.v1.train.get_or_create_global_step()
        global_step_inc = tf.compat.v1.assign_add(global_step, 1)
        is_chief = (index == 0)
        print(datetime.now().isoformat() +
              " started ------------------------------------")
        t = time.time()
        total_step = 0
        try:
            with tf.compat.v1.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    config=sess_config,
                    checkpoint_dir="./target/tmp/input_output/" +
                    str(t)) as mon_sess:
                # while not mon_sess.should_stop():
                while True:
                    total_step, _, _ = mon_sess.run(
                        [global_step_inc, key, value])
                    if (total_step % 10000 == 0):
                        log_speed(total_step, t)
        except Exception as e:
            print('traceback.print_exc():')
            traceback.print_exc()
            sys.stdout.flush()
        finally:
            print(datetime.now().isoformat() +
                  " ended --------------------------------------")
            log_speed(total_step, t)
            SummaryWriterCache.clear()
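Examples #2, #4, and #8 report throughput through a log_speed(total_step, start_time) helper that the listing does not include. A plausible sketch (message format assumed) is:

# Hypothetical sketch of log_speed; the original helper is not shown.
import sys
import time

def log_speed(total_step, start_time):
    elapsed = time.time() - start_time
    rate = total_step / elapsed if elapsed > 0 else 0.0
    print("steps: %d, elapsed: %.1f s, speed: %.1f records/s"
          % (total_step, elapsed, rate))
    sys.stdout.flush()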
Example #3
def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):
            record_defaults = [[9], [tf.constant(value=9, dtype=tf.int64)],
                               [9.0],
                               [tf.constant(value=9.0, dtype=tf.float64)],
                               ["9.0"]]
            dataset = context.flinkStreamDataSet(buffer_size=0)
            dataset = dataset.map(lambda record: tf.decode_csv(
                record, record_defaults=record_defaults))
            dataset = dataset.batch(3)
            iterator = dataset.make_one_shot_iterator()
            input_records = iterator.get_next()

            global_step = tf.train.get_or_create_global_step()
            global_step_inc = tf.assign_add(global_step, 1)
            out_list = [input_records[0], input_records[2], input_records[4]]
            out = tff_ops.encode_csv(input_list=out_list)
            is_chief = (index == 0)
            t = time.time()
            try:
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=is_chief,
                        config=sess_config,
                        checkpoint_dir="./target/tmp/input_output/" +
                        str(t)) as mon_sess:
                    # while not mon_sess.should_stop():
                    while True:
                        print(index, mon_sess.run([global_step_inc, out]))
                        sys.stdout.flush()
                        # time.sleep(1)
            except Exception as e:
                print('traceback.print_exc():')
                traceback.print_exc()
                sys.stdout.flush()
            finally:
                SummaryWriterCache.clear()
Example #4
def map_fun(context):
    print(tf.__version__)
    sys.stdout.flush()
    tf.logging.set_verbosity(tf.logging.ERROR)
    jobName = context.jobName
    index = context.index
    clusterStr = context.properties["cluster"]
    delim = context.properties["SYS:delim"]
    print(index, clusterStr)
    sys.stdout.flush()
    clusterJson = json.loads(clusterStr)
    cluster = tf.train.ClusterSpec(cluster=clusterJson)
    server = tf.train.Server(cluster, job_name=jobName, task_index=index)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    with tf.device(
            tf.train.replica_device_setter(worker_device='/job:worker/task:' +
                                           str(index),
                                           cluster=cluster)):
        dataset = context.flinkStreamDataSet(buffer_size=0)
        iterator = dataset.make_one_shot_iterator()
        input_records = iterator.get_next()

        global_step = tf.contrib.framework.get_or_create_global_step()
        global_step_inc = tf.assign_add(global_step, 1)
        is_chief = (index == 0)
        print(datetime.now().isoformat() +
              " started ------------------------------------")
        t = time.time()
        total_step = 0
        try:
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    config=sess_config,
                    checkpoint_dir="./target/tmp/input_output/" +
                    str(t)) as mon_sess:
                # while not mon_sess.should_stop():
                while True:
                    total_step, _ = mon_sess.run(
                        [global_step_inc, input_records])
                    if (total_step % 10000 == 0):
                        log_speed(total_step, t)
        except Exception as e:
            print('traceback.print_exc():')
            traceback.print_exc()
            sys.stdout.flush()
        finally:
            print(datetime.now().isoformat() +
                  " ended --------------------------------------")
            log_speed(total_step, t)
            SummaryWriterCache.clear()
Example #5
def map_fun(context):
    tf.compat.v1.disable_v2_behavior()
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()
    cluster = tf.compat.v1.train.ClusterSpec(cluster=cluster_json)
    server = tf.compat.v1.train.Server(cluster,
                                       job_name=job_name,
                                       task_index=index)
    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % index])
    if 'ps' == job_name:
        from time import sleep
        while True:
            sleep(1)
    else:
        with tf.compat.v1.device(
                tf.compat.v1.train.replica_device_setter(
                    worker_device='/job:worker/task:' + str(index),
                    cluster=cluster)):

            global_step = tf.compat.v1.train.get_or_create_global_step()
            global_step_inc = tf.compat.v1.assign_add(global_step, 1)
            input_records = [
                tf.constant([1, 2, 3]),
                tf.constant([1.0, 2.0, 3.0]),
                tf.constant(['1.0', '2.0', '3.0'])
            ]
            out = tff_ops.encode_csv(input_list=input_records, field_delim='|')
            fw = tff_ops.FlinkTFRecordWriter(address=context.toFlink())
            w = fw.write([out])
            is_chief = (index == 0)
            t = time.time()
            try:
                hooks = [tf.compat.v1.train.StopAtStepHook(last_step=50)]
                with tf.compat.v1.train.MonitoredTrainingSession(
                        master=server.target,
                        config=sess_config,
                        is_chief=is_chief,
                        checkpoint_dir="./target/tmp/with_output/" + str(t),
                        hooks=hooks) as mon_sess:
                    while not mon_sess.should_stop():
                        print(index, mon_sess.run([global_step_inc, w]))
                        sys.stdout.flush()
                        time.sleep(1)
            finally:
                SummaryWriterCache.clear()
Example #6
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print (cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    cluster = tf.train.ClusterSpec(cluster=cluster_json)
    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

    def feed_dict(images, labels):
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        from time import sleep
        while True:
            sleep(1)
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(worker_device="/job:worker/task:" + str(task_index), cluster=cluster)):

            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            # Variables of the hidden layer
            hid_w = tf.Variable(
                tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS),
                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            # Variables of the softmax layer
            sm_w = tf.Variable(
                tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)),
                name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.train.get_or_create_global_step()

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

            train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

            iter = input_iter(tf_context, batch_size)
            next_batch = iter.get_next()

            is_chief = (task_index == 0)
            sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
                                         device_filters=["/job:ps", "/job:worker/task:%d" % task_index])

            # The MonitoredTrainingSession takes care of session initialization, restoring from
            #  a checkpoint, and closing when done or an error occurs
            mon_sess = tf.train.MonitoredTrainingSession(master=server.target, is_chief=is_chief,
                                                         checkpoint_dir=checkpoint_dir,
                                                         stop_grace_period_secs=10, max_wait_secs=300,
                                                         config=sess_config,
                                                         chief_only_hooks=[ExportHook(export_dir, x,
                                                                                      prediction)])
            processed = 0
            while not mon_sess.should_stop():
                # Run a training step asynchronously
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                try:
                    images, labels = mon_sess.run(next_batch)
                    processed += images.shape[0]
                    # print mon_sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    break

                batch_xs, batch_ys = feed_dict(images, labels)
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0 and not mon_sess.should_stop():
                    _, step = mon_sess.run([train_op, global_step], feed_dict=feed)
                    if step % 100 == 0:
                        print("{0}, Task {1} step: {2} accuracy: {3}".format(
                            datetime.now().isoformat(), task_index, step,
                            mon_sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
                        sys.stdout.flush()

            print(str(processed) + " records processed.")
            print("{0} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))
            mon_sess.close()

    SummaryWriterCache.clear()
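Examples #6 and #7 depend on two helpers the listing omits: input_iter(tf_context, batch_size), which is assumed to turn the records streamed in from Flink into a batched dataset iterator yielding (images, labels), and ExportHook(export_dir, x, prediction), a chief-only hook that saves the trained graph for the inference job in Example #7. A rough sketch of such a hook, assuming it exports a SavedModel with the 'image' input and 'prediction' output keys that Example #7 loads, is:

# Hypothetical sketch of ExportHook; the real class is not shown in the
# listing. It exports a SavedModel whose signature matches the keys that
# Example #7 reads back ('image' -> 'prediction').
import tensorflow as tf

class ExportHook(tf.train.SessionRunHook):
    def __init__(self, export_dir, input_tensor, output_tensor):
        self._export_dir = export_dir
        self._input = input_tensor
        self._output = output_tensor

    def end(self, session):
        # Export once training ends; only attach this hook on the chief.
        signature = tf.saved_model.signature_def_utils.predict_signature_def(
            inputs={'image': self._input}, outputs={'prediction': self._output})
        builder = tf.saved_model.builder.SavedModelBuilder(self._export_dir)
        builder.add_meta_graph_and_variables(
            session, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    signature})
        builder.save()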
Example #7
def map_fun(context):
    tf_context = TFContext(context)
    job_name = tf_context.get_role_name()
    task_index = tf_context.get_index()
    cluster_json = tf_context.get_tf_cluster()
    print(cluster_json)
    sys.stdout.flush()

    props = context.properties
    batch_size = int(props.get("batch_size"))
    epochs = int(props.get("epochs"))
    checkpoint_dir = props.get("checkpoint_dir")
    export_dir = props.get("export_dir")

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    session = tf.Session()
    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    input_key = 'image'
    output_key = 'prediction'
    # saved_model_dir = '/home/chen/code/TensorFlowOnFlink/target/export/1539071075170/'
    test_log("before load")
    meta_graph_def = tf.saved_model.loader.load(
        session, [tf.saved_model.tag_constants.SERVING], export_dir=export_dir)
    test_log("after load")
    signature = meta_graph_def.signature_def

    x_tensor_name = signature[signature_key].inputs[input_key].name
    test_log(x_tensor_name)
    y_tensor_name = signature[signature_key].outputs[output_key].name
    test_log(y_tensor_name)
    x = session.graph.get_tensor_by_name(x_tensor_name)
    y = session.graph.get_tensor_by_name(y_tensor_name)

    # write_feed, write_op, close_op = context.getOutputWriterOp()
    write_feed = tf.placeholder(dtype=tf.string)
    write_op, close_op = tf_context.output_writer_op([write_feed])
    iter = input_iter(tf_context, batch_size)
    next_batch = iter.get_next()
    prediction = tf.argmax(next_batch[1], 1, name="prediction")

    while True:
        try:
            images, labels, labels_ = session.run(
                [next_batch[0], next_batch[1], prediction])
        except tf.errors.OutOfRangeError:
            break
        batch_xs, batch_ys = feed_dict(images, labels)
        feed = {x: batch_xs}
        # test_log(feed_data)
        # input_res = session.run(x_, feed_dict={input_data: feed_data})
        # print "input_data", input_res
        y_res = session.run(y, feed_dict=feed)
        # print "y_res", y_res, "y_org", labels_
        # sys.stdout.flush()
        for i in range(len(y_res)):
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'predict_label':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[y_res[i]])),
                    'label_org':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[labels_[i]])),
                }))
            # print "write:", i
            sys.stdout.flush()
            session.run(write_op,
                        feed_dict={write_feed: example.SerializeToString()})

    session.run(close_op)

    SummaryWriterCache.clear()
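Example #7 also relies on helpers defined elsewhere in its source: the same feed_dict(images, labels) normalization shown in Example #6, the input_iter used in Example #6, and a test_log logging helper. A minimal test_log consistent with how it is called (a single message argument) might be:

# Hypothetical sketch of test_log; the original helper is not shown.
import sys
from datetime import datetime

def test_log(message):
    print(datetime.now().isoformat() + " " + str(message))
    sys.stdout.flush()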
Example #8
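# Note: this example begins mid-function in the source listing; it is the
# worker-side body of a map_fun like the one in Example #4, with the
# enclosing def and the cluster/server/device setup omitted.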
        dataset = context.flinkStreamDataSet(buffer_size=0)
        iterator = dataset.make_one_shot_iterator()
        input_records = iterator.get_next()

        global_step = tf.contrib.framework.get_or_create_global_step()
        global_step_inc = tf.assign_add(global_step, 1)
        is_chief = (index == 0)
        print (datetime.now().isoformat() + " started ------------------------------------")
        t = time.time()
        total_step = 0
        try:
            with tf.train.MonitoredTrainingSession(master=server.target, is_chief=is_chief, config=sess_config,
                                                   checkpoint_dir="./target/tmp/input_output/" + str(t)) as mon_sess:
                # while not mon_sess.should_stop():
                while True:
                    total_step, _ = mon_sess.run([global_step_inc, input_records])
                    if (total_step % 10000 == 0):
                        log_speed(total_step, t)
        except Exception as e:
            print('traceback.print_exc():')
            traceback.print_exc()
            sys.stdout.flush()
        finally:
            print (datetime.now().isoformat() + " ended --------------------------------------")
            log_speed(total_step, t)
            SummaryWriterCache.clear()


if __name__ == "__main__":
    map_fun(context)