        atol_threshold = 3e-2

    hidden_dim = head_num * size_per_head
    initializer_range = 0.02

    sequence_length = np.random.randint(1, max_seq_len + 1,
                                        size=batch_size).astype(np.int32)
    if avg_seq_len != -1 and remove_padding:
        # This means we use "remove_padding" and set a smaller average sequence length
        sequence_length = (np.ones(batch_size) * avg_seq_len).astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    # Baseline encoder arguments with INT8 disabled (int8_mode=0).
    encoder_notInt8_args = TransformerArgument(beam_width=1,
                                               head_num=head_num,
                                               size_per_head=size_per_head,
                                               num_layer=num_layer,
                                               dtype=tf_datatype,
                                               remove_padding=remove_padding,
                                               int8_mode=0)

    encoder_Int8_v1_args = TransformerArgument(beam_width=1,
                                               head_num=head_num,
                                               size_per_head=size_per_head,
                                               num_layer=num_layer,
                                               dtype=tf_datatype,
                                               remove_padding=remove_padding,
                                               int8_mode=1)  # assumption: the "_v1" name maps to int8_mode=1; the original listing is truncated here
Example 2
def encoder_sample(args_dict):
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    tf_datatype = tf.float32
    np_datatype = np.float32
    atol_threshold = 3e-5
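    # int8_mode selects the INT8 kernel variant (0 disables INT8; mode N is reported below as "INT8-vN").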
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"
    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16
        atol_threshold = 3e-2

    hidden_dim = head_num * size_per_head

    sequence_length = np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1:
        # Use the requested average sequence length for every batch entry
        # (the random lengths drawn above are always overwritten here).
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        # Default to half of the maximum sequence length.
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)
    
    attention_mask = build_sequence_mask(sequence_length, num_heads=head_num, maximum_length=max_seq_len, dtype=tf_datatype)
    
    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=False,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)

    # The EFF ("Effective FasterTransformer") variant runs the same encoder with padding removal enabled.
    eff_encoder_args = copy.deepcopy(encoder_args)
    eff_encoder_args.remove_padding = True

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {}
    for v in encoder_vars:
        encoder_variables_dict[v.name] = v
    
    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)

    eff_encoder_result = op_encoder(inputs=from_tensor,
                                    encoder_args=eff_encoder_args,
                                    attention_mask=attention_mask,
                                    encoder_vars_dict=encoder_variables_dict,
                                    sequence_length=sequence_length)

    '''
    Because FasterTransformer skips some computation for the padding parts,
    the cross-check results would be wrong if we did not mask these parts.
    '''
    # tf.sequence_mask yields a [batch, max_seq_len] mask; expand_dims makes it
    # broadcastable over the hidden dimension so padded positions are zeroed.
    tf_encoder_result = tf_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)
    op_encoder_result = op_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)
    eff_encoder_result = eff_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)

    # Allow GPU memory growth and enable XLA JIT compilation (ON_1) for the graph.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for idx, name in enumerate(encoder_variables_dict):
            print((str(idx) + " " + str(name) + " " +
                   str(encoder_variables_dict[name].shape)) + " " + str(encoder_variables_dict[name].dtype))
            
        print("#################################")
        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)
        eff_encoder_result_val = sess.run(eff_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input", tf_encoder_result_val, op_encoder_result_val, atol_threshold)
        cross_check("Encoder TF v.s. EFF-FT with tensor input", tf_encoder_result_val, eff_encoder_result_val, atol_threshold)
        
        op_diff = abs(tf_encoder_result_val.reshape([-1]) - op_encoder_result_val.reshape([-1]))
        eff_diff = abs(tf_encoder_result_val.reshape([-1]) - eff_encoder_result_val.reshape([-1]))
        max_diff = max(op_diff.max(), eff_diff.max())

        ite = 50
        def _cond(from_tensor):
            # Always true; the loop is bounded by maximum_iterations below.
            return tf.constant(True)
            
        def _ft_body(from_tensor):
            op_encoder_result = op_encoder(inputs=from_tensor,
                                            encoder_args=encoder_args,
                                            attention_mask=attention_mask,
                                            encoder_vars_dict=encoder_variables_dict,
                                            sequence_length=sequence_length)
            return op_encoder_result

        def _eff_body(from_tensor):
            eff_encoder_result = op_encoder(inputs=from_tensor,
                                            encoder_args=eff_encoder_args,
                                            attention_mask=attention_mask,
                                            encoder_vars_dict=encoder_variables_dict,
                                            sequence_length=sequence_length)
            return eff_encoder_result

        def _tf_body(from_tensor):
            tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                            encoder_args=encoder_args,
                                            attention_mask=attention_mask)
            return tf_encoder_result

        tf_while_tensor = tf.while_loop(_cond,
                                        _tf_body,
                                        loop_vars=[from_tensor],
                                        back_prop=False,
                                        maximum_iterations=ite)

        ft_while_tensor = tf.while_loop(_cond,
                                        _ft_body,
                                        loop_vars=[from_tensor],
                                        back_prop=False,
                                        maximum_iterations=ite)

        eff_while_tensor = tf.while_loop(_cond,
                                         _eff_body,
                                         loop_vars=[from_tensor],
                                         back_prop=False,
                                         maximum_iterations=ite)

        if args_dict['test_time'] == 1:

            # tf_time = time_test(sess, tf_encoder_result, ite)
            # ft_time = time_test(sess, op_encoder_result, ite)
            # eff_time = time_test(sess, eff_encoder_result, ite)

            # Use a while loop to run 'ite' iterations inside the graph so that the
            # overheads of memory copies and model preprocessing are amortized.
            # We use these times as the profiling results.
            tf_while_time = time_test(sess, tf_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            ft_while_time = time_test(sess, ft_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            eff_while_time = time_test(sess, eff_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            
            ft_type = args_dict['data_type'].upper()
            if int8_mode != 0:
                ft_type = "INT8-v{}".format(int8_mode)
            
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer TF-time     {:6.2f} ms".format(batch_size, max_seq_len, args_dict['data_type'].upper(), num_layer, tf_time))
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer FT-OP-time  {:6.2f} ms".format(batch_size, max_seq_len, ft_type, num_layer, ft_time))
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer EFF-OP-time {:6.2f} ms".format(batch_size, max_seq_len, ft_type, num_layer, eff_time))

            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer TF-while-time     {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, args_dict['data_type'].upper(), num_layer, tf_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer FT-OP-while-time  {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, ft_type, num_layer, ft_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer EFF-OP-while-time {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, ft_type, num_layer, eff_while_time, ite))


        if args_dict['thread_num'] > 1:
            # Multi-threading demonstration
            thread_list = []
            thread_num = args_dict['thread_num']
            def run():
                ft_while_time = time_test(sess, ft_while_tensor, 1) / ite # while_loop has run ite times
                print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-while-time {:6.2f} ms with {} threads".format(batch_size,
                    max_seq_len, num_layer, ft_while_time, thread_num))

            for i in range(thread_num):
                thread_list.append(threading.Thread(target=run, name="RunFT"))
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

        return max_diff
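
The helpers time_test and cross_check are defined elsewhere in the sample utilities and are not part of this listing. Purely as an illustration of how the timing is used above, a minimal time_test could look like the sketch below (the name, warm-up count, and return units are assumptions; the real helper may differ):

def time_test_sketch(sess, tensor, iterations, warmup=10):
    # Warm-up runs exclude one-time graph and kernel initialization costs.
    for _ in range(warmup):
        sess.run(tensor)
    start = time.time()
    for _ in range(iterations):
        sess.run(tensor)
    # Return the average wall-clock milliseconds per session run.
    return (time.time() - start) * 1000.0 / iterations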
Example 3
    def run_attn(self, batch_size, seq_len, head_num, size_per_head):
        hidden_dim = head_num * size_per_head
        np.random.seed(1)
        tf.set_random_seed(1)

        sequence_length = np.random.randint(1, seq_len + 1,
                                            size=batch_size).astype(np.int32)
        attention_mask = build_sequence_mask(sequence_length,
                                             num_heads=head_num,
                                             maximum_length=seq_len,
                                             dtype=tf.float16)

        q_input = np.random.rand(batch_size, seq_len,
                                 hidden_dim).astype(np.float16)
        k_input = np.random.rand(batch_size, seq_len,
                                 hidden_dim).astype(np.float16)
        v_input = np.random.rand(batch_size, seq_len,
                                 hidden_dim).astype(np.float16)

        query_layer = tf.convert_to_tensor(q_input, dtype=tf.float16)
        key_layer = tf.convert_to_tensor(k_input, dtype=tf.float16)
        value_layer = tf.convert_to_tensor(v_input, dtype=tf.float16)

        def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                                 seq_length, width):
            output_tensor = tf.reshape(
                input_tensor,
                [batch_size, seq_length, num_attention_heads, width])

            output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
            return output_tensor

        # `query_layer` = [B, N, F, H]
        query_layer = transpose_for_scores(query_layer, batch_size, head_num,
                                           seq_len, size_per_head)

        # `key_layer` = [B, N, T, H]
        key_layer = transpose_for_scores(key_layer, batch_size, head_num,
                                         seq_len, size_per_head)

        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        attention_scores = tf.multiply(attention_scores,
                                       1.0 / math.sqrt(float(size_per_head)))

        if attention_mask is not None:
            # `attention_mask` = [B, 1, F, T]
            # Use the static rank here: tf.rank() returns a tensor in graph mode,
            # so comparing it with a Python int would not behave as intended.
            if attention_mask.shape.ndims == 3:
                attention_mask = tf.expand_dims(attention_mask, axis=[1])

            adder = (1.0 - tf.cast(attention_mask, tf.float16)) * -10000.0

            attention_scores += adder

        attention_probs = tf.nn.softmax(attention_scores)

        # `value_layer` = [B, N, T, H]
        value_layer = tf.reshape(
            value_layer, [batch_size, seq_len, head_num, size_per_head])

        value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

        # `context_layer` = [B, N, F, H] -> [B, F, N, H] -> [B, F, N*H]
        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
        context_layer = tf.reshape(
            context_layer, [batch_size, seq_len, head_num * size_per_head])

        # remove the padding of tf output to compare with op output
        tf_output = []
        for i in range(batch_size):
            tf_output.append(context_layer[i][0:sequence_length[i]])
        tf_output = tf.concat(tf_output, axis=0)

        # Attention op
        fused_multihead_attention_op = tf.load_op_library(
            os.path.join('./lib/libtf_fused_multihead_attention.so'))

        # remove padding of q, k and v
        q_input_remove_pad = []
        for i in range(batch_size):
            q_input_remove_pad.append(q_input[i][0:sequence_length[i]])
        q_input_remove_pad = tf.concat(q_input_remove_pad, axis=0)

        k_input_remove_pad = []
        for i in range(batch_size):
            k_input_remove_pad.append(k_input[i][0:sequence_length[i]])
        k_input_remove_pad = tf.concat(k_input_remove_pad, axis=0)

        v_input_remove_pad = []
        for i in range(batch_size):
            v_input_remove_pad.append(v_input[i][0:sequence_length[i]])
        v_input_remove_pad = tf.concat(v_input_remove_pad, axis=0)

        # Pack Q, K and V into the interleaved layout the fused op expects:
        # concat gives [total_tokens, 3*hidden]; the reshape/transpose pair
        # rearranges each token to [head_num, 3, size_per_head] before flattening.
        qkv_input = tf.concat(
            [q_input_remove_pad, k_input_remove_pad, v_input_remove_pad],
            axis=1)
        qkv_input = tf.reshape(qkv_input, [-1, 3, head_num, size_per_head])
        qkv_input = tf.transpose(qkv_input, [0, 2, 1, 3])
        qkv_input = tf.reshape(qkv_input, [-1, 3 * head_num * size_per_head])

        # The second argument is the prefix sum of the sequence lengths, i.e. the
        # offset of each sequence inside the packed, padding-free batch.
        op_output = fused_multihead_attention_op.multi_head_attention(
            qkv_input, np.cumsum(np.insert(sequence_length, 0, 0), axis=0),
            head_num, size_per_head, seq_len)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            print(batch_size, seq_len, head_num, size_per_head)

            sess.run(tf.global_variables_initializer())
            tf_result = sess.run(tf_output).flatten()
            op_result = sess.run(op_output).flatten()

            assert (abs(tf_result - op_result).max() < 0.03)
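
run_attn is written as a method of a test class that this listing does not show. Purely as an illustration (the class name, base class, and shape values below are assumptions, not from the source), a driver could look like:

import unittest

class FusedAttentionTest(unittest.TestCase):
    # run_attn from the example above is assumed to be defined on this class.
    def test_fused_attention(self):
        for batch_size, seq_len in [(8, 128), (4, 384)]:
            self.run_attn(batch_size, seq_len, head_num=12, size_per_head=64)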
Example 4
def encoder_sample(args_dict):
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    remove_padding = args_dict['remove_padding'].lower() == "true"
    tf_datatype = tf.float32
    np_datatype = np.float32
    atol_threshold = 3e-5
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"
    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16
        atol_threshold = 3e-2

    hidden_dim = head_num * size_per_head

    sequence_length = np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1 and remove_padding:
        # This means we use "remove_padding" and set a smaller average sequence length
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        # Default to half of the maximum sequence length.
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=remove_padding,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {}
    for v in encoder_vars:
        encoder_variables_dict[v.name] = v

    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)
    '''
    Because FasterTransformer skips some computation for the padding parts,
    the cross-check results would be wrong if we did not mask these parts.
    '''
    tf_encoder_result = tf_encoder_result * tf.expand_dims(tf.sequence_mask(
        sequence_length, maxlen=max_seq_len, dtype=tf_datatype),
                                                           axis=-1)
    op_encoder_result = op_encoder_result * tf.expand_dims(tf.sequence_mask(
        sequence_length, maxlen=max_seq_len, dtype=tf_datatype),
                                                           axis=-1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Prepared for optional tracing; not used in the runs below.
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        for idx, name in enumerate(encoder_variables_dict):
            print((str(idx) + " " + str(name) + " " +
                   str(encoder_variables_dict[name].shape)) + " " +
                  str(encoder_variables_dict[name].dtype))

        print("#################################")
        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input",
                    tf_encoder_result_val, op_encoder_result_val,
                    atol_threshold)
        '''
        Use numpy arrays as the inputs of the FasterTransformer OP.

        This method requires more time for op initialization (especially for FP16),
        but the inference would be slightly faster than using tensors as inputs.
        '''
        encoder_variables_dict_2 = {}
        for var, val in zip(encoder_vars, sess.run(encoder_vars)):
            encoder_variables_dict_2[var.name] = val

        # op_encoder_result_2 = op_encoder(inputs=from_tensor,
        #                                 encoder_args=encoder_args,
        #                                 attention_mask=attention_mask,
        #                                 encoder_vars_dict=encoder_variables_dict_2,
        #                                 sequence_length=sequence_length)
        # op_encoder_result_val_2 = sess.run(op_encoder_result_2)
        # cross_check("Encoder TF v.s. FT with numpy input", tf_encoder_result_val,
        #             op_encoder_result_val_2, atol_threshold)

        if args_dict['test_time'] == 1:

            ite = 50
            tf_time = time_test(sess, tf_encoder_result, ite)
            op_time = time_test(sess, op_encoder_result, ite)
            # op_time_2 = time_test(sess, op_encoder_result_2, ite)

            print(
                "[INFO] batch_size {} max_seq_len {} {} layer TF-time {:6.2f} ms"
                .format(batch_size, max_seq_len, num_layer, tf_time))
            print(
                "[INFO] batch_size {} max_seq_len {} {} layer FT-OP-tensor-time {:6.2f} ms"
                .format(batch_size, max_seq_len, num_layer, op_time))
            # print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-numpy-time {:6.2f} ms".format(batch_size, max_seq_len, num_layer, op_time_2))

        # Return the maximum absolute element-wise difference for the caller to check.
        return abs(tf_encoder_result_val.reshape([-1]) -
                   op_encoder_result_val.reshape([-1])).max()
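
cross_check is another helper that this listing does not define. A minimal sketch consistent with how it is called above (a label, two numpy arrays, and an absolute tolerance; the name and exact behavior are assumptions) might be:

def cross_check_sketch(label, tf_result, op_result, atol):
    # Compare two result arrays element-wise and report the maximum difference.
    diff = abs(tf_result.reshape([-1]) - op_result.reshape([-1]))
    print("[{}] max abs diff: {} (atol threshold: {})".format(label, diff.max(), atol))
    assert diff.max() < atol, "cross check failed for {}".format(label)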