Example #1
def transformation():
    """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert
    it to a pandas data frame for internal use and then convert the predictions back to CSV (which really
    just means one prediction per line, since there's a single column.
    """
    data = None

    # Convert from CSV to pandas
    if flask.request.content_type == 'text/csv':
        data = flask.request.data.decode('utf-8')
        s = StringIO(data)
        df = pd.read_csv(s)
    else:
        return flask.Response(response='This predictor only supports CSV data',
                              status=415, mimetype='text/plain')
    
    # Read the S3 bucket and file path from the input
    bucket = df.loc[0, 'bucket']
    s3_file_path = df.loc[0, 'file_path']
    # Local path where the prediction dataset will be downloaded
    pred_file_path = 'gluonts_ds.pkl'
    
    # Download the prediction file from S3
    try:
        ut.download_file_from_S3(bucket, s3_file_path, pred_file_path)
    except Exception as e:
        print('Failed to download {} from bucket {}: {}'.format(s3_file_path, bucket, e))

    print('current dir:', os.listdir("."))
    print('model dir:', os.listdir(model_path))
    
    # Load the downloaded file into a GluonTS dataset
    pred_ds = mdl.train_input_fn(pred_file_path)
    
    # Do the prediction
    predictions = ScoringService.predict(pred_ds)

    # Convert the predictions back to CSV
    out = StringIO()
    pd.DataFrame({'results': predictions}).to_csv(out, header=False, index=False)
    result = out.getvalue()

    return flask.Response(response=result, status=200, mimetype='text/csv')
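
The handler above assumes the usual Flask wiring around it, which is not shown in this snippet. Below is a minimal sketch of that wiring; the `app` object, the `/ping` and `/invocations` route names, and the port are illustrative assumptions (typical of SageMaker-style serving containers), not part of the original code.

# Minimal sketch of the Flask wiring assumed by transformation() above.
# Route names, the app object, and the port are illustrative assumptions.
import flask

app = flask.Flask(__name__)

@app.route('/ping', methods=['GET'])
def ping():
    # Lightweight health check: report that the container is up.
    return flask.Response(response='\n', status=200, mimetype='application/json')

@app.route('/invocations', methods=['POST'])
def invocations():
    # Delegate the actual scoring to transformation().
    return transformation()

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)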
Example #2
import os
import tensorflow as tf

TRAINING_STEPS = 1000
EVAL_STEPS = 100

from model import train_input_fn, eval_input_fn, model_fn, eval_on_train_data_input_fn

print('Path to the training data:')

trainfolderpath = os.path.join(os.environ['DATASPINE_INPUT_PATH'], 'training')

print(trainfolderpath)
train_func = train_input_fn(trainfolderpath, "training.csv")

estimator = tf.estimator.Estimator(model_fn=model_fn)

estimator.train(input_fn=train_func, steps=TRAINING_STEPS)

# Export the prepared model
from model import serving_input_fn

serving_func = serving_input_fn(hyperparameters={})

export_path = os.environ['DATASPINE_OUTPUT_PATH']
exported_model = estimator.export_savedmodel(
    export_dir_base=export_path, serving_input_receiver_fn=serving_func)

print('')
print(exported_model)
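
The `serving_input_fn` imported from `model` is not shown here. As a rough illustration, a TF 1.x serving input function for this kind of Estimator export often looks like the sketch below; the feature name 'inputs' and its shape are assumptions, not taken from the actual model module.

# Minimal sketch of a TF 1.x serving_input_fn for an Estimator export.
# The feature name 'inputs' and its shape are illustrative assumptions.
import tensorflow as tf

def serving_input_fn(hyperparameters):
    # Describe the raw tensor the exported SavedModel will accept at serving time.
    inputs = tf.placeholder(tf.float32, shape=[None, 4], name='inputs')
    return tf.estimator.export.build_raw_serving_input_receiver_fn({'inputs': inputs})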
Example #3
def test():
    # Dropout is disabled (rate 0) in the test setup
    BATCH_SIZE = args.batch_size
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    CKPT = args.checkpoint
    LEARNING_RATE = args.learning_rate
    EPOCH = 1
    DROPOUT = 0
    start_word = "<s>"
    end_word = "</s>"

    test_source_tensor, test_source_tokenizer, test_target_tensor, test_target_tokenizer = \
        load_test_data(test_translate_from=r"./data/newstest2015.en", test_translate_to=r'./data/newstest2015.de',
           vocab_from=r'./data/vocab.50K.en', vocab_to=r'./data/vocab.50K.de',
           pad_length=90, limit=args.limit)
    print(len(test_source_tensor))
    vocab_source_size = len(test_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(test_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)
    buffer_size = len(test_source_tensor)

    test_steps = len(test_source_tensor) // BATCH_SIZE
    dataset = train_input_fn(test_source_tensor, test_target_tensor,
                             buffer_size, EPOCH, BATCH_SIZE)

    encoder = Encoder(vocab_source_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      dropout_rate=DROPOUT,
                      batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      batch_size=BATCH_SIZE,
                      method=None,
                      dropout_rate=DROPOUT)
    # Use the same optimizer type as in training so the checkpoint structure matches on restore
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.001)
    ckpt = tf.train.Checkpoint(optimizer=optimizer,
                               encoder=encoder,
                               decoder=decoder)
    manager = tf.train.CheckpointManager(ckpt, args.checkpoint, max_to_keep=10)
    ckpt.restore(manager.latest_checkpoint)
    per_epoch_loss, per_epoch_plex = 0, 0

    def test_wrapper(source, target):
        result = ""
        source_out, source_state = encoder(source,
                                           encoder_state,
                                           activation="tanh")

        initial = tf.expand_dims(
            [test_target_tokenizer.word_index[start_word]] * BATCH_SIZE, 1)
        attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
        apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        # cur_total_loss accumulates the loss over decoding steps, i.e. the batch loss
        cur_total_loss, cur_total_plex, cur_loss = 0, 0, 0
        for i in range(target.shape[1]):
            output_state, source_state, attention_state = decoder(
                initial, source_state, source_out, attention_state)
            # TODO: check for the case where target is 0
            # 0 is assumed to be the padding value in target and should not appear
            # as a real token. For safety, a binary (0/1) mask zeroes out the loss
            # at padded positions.
            cur_loss = apply_loss(target[:, i], output_state)
            perplex = tf.nn.sparse_softmax_cross_entropy_with_logits(
                target[:, i], output_state)
            current_ind = tf.argmax(output_state[0])
            mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
            mask = tf.cast(mask, dtype=cur_loss.dtype)
            cur_loss *= mask
            perplex *= mask
            cur_total_loss += tf.reduce_mean(cur_loss)
            cur_total_plex += tf.reduce_mean(perplex)
            #tf.print("check current id: ",current_ind)
            #tf.print(test_target_tokenizer.index_word[29])
            #print(current_ind.numpy())
            if current_ind.numpy() == 0:
                # 0 is for pad value, we don't need to record it
                continue
            result += test_target_tokenizer.index_word[current_ind.numpy()]
            if test_target_tokenizer.index_word[
                    current_ind.numpy()] == end_word:
                break
            initial = tf.expand_dims(target[:, i], 1)
        batch_loss = cur_total_loss / target.shape[1]
        batch_perplex = cur_total_plex / target.shape[1]
        return batch_loss, batch_perplex, result

    encoder_hidden = encoder.initialize_hidden_state()
    encoder_cell = encoder.initialize_cell_state()
    encoder_state = [[encoder_hidden, encoder_cell],
                     [encoder_hidden, encoder_cell],
                     [encoder_hidden, encoder_cell],
                     [encoder_hidden, encoder_cell]]
    # TODO : Double check to make sure all re-initialization is performed
    result_by_batch = []
    for idx, data in tqdm(enumerate(dataset.take(test_steps)),
                          total=test_steps):
        source, target = data
        batch_loss, batch_perplex, result = test_wrapper(source, target)
        with open("checkpoint/test_logger.txt", "a") as filelogger:
            print("The validation loss in batch " + str(idx) + " is : ",
                  str(batch_loss.numpy() / (idx + 1.0)),
                  file=filelogger)
            print("The validation perplex in batch " + str(idx) + " is : ",
                  str(batch_perplex.numpy() / (idx + 1.0)),
                  file=filelogger)
        per_epoch_loss += batch_loss
        per_epoch_plex += batch_perplex
        assert type(result) == str
        result_by_batch.append(result)
        #if idx>=3:
        #    break
    with open("checkpoint/test_logger.txt", "a") as filelogger:
        print("The validation loss is : ",
              str(per_epoch_loss.numpy() / (idx + 1.0)),
              file=filelogger)
        print("The validation perplex is: ",
              str(tf.exp(per_epoch_plex).numpy() / (idx + 1.0)),
              file=filelogger)
    return test_target_tokenizer, result_by_batch
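
Both the test loop above and the training loop in the next example rely on the same padding-mask trick when accumulating the loss. The self-contained sketch below isolates that logic; it uses an explicit NONE reduction so the per-example masking is visible (which differs slightly from the default reduction used above), and all shapes, logits, and the padding id are illustrative.

# Self-contained sketch of the padding mask applied to the per-step loss above.
# Shapes, logits, and the padding id (0) are illustrative assumptions.
import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

targets = tf.constant([3, 7, 0, 0])    # batch of 4 target ids; the last two are padding
logits = tf.random.normal((4, 10))     # decoder logits over a 10-word vocabulary

per_example = loss_fn(targets, logits)                                    # shape (4,)
mask = tf.cast(tf.math.logical_not(tf.math.equal(targets, 0)),
               dtype=per_example.dtype)                                   # 1 for real tokens, 0 for padding
masked = per_example * mask                                               # padded positions contribute no loss
step_loss = tf.reduce_sum(masked) / tf.maximum(tf.reduce_sum(mask), 1.0)  # average over real tokens only
print(step_loss.numpy())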
Example #4
def train():
    # args is a global variable in this task
    BATCH_SIZE = args.batch_size
    EPOCH = args.epoch
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    LEARNING_RATE = args.learning_rate
    DROPOUT = args.dropout
    METHOD = args.method
    GPUNUM = args.gpuNum
    CKPT = args.checkpoint
    LIMIT = args.limit
    start_word = "<s>"
    end_word = "</s>"
    # The tokenizer only stores the information needed to split the data;
    # it is not itself part of the data.
    train_source_tensor, train_source_tokenizer, train_target_tensor, train_target_tokenizer = \
        load_data(pad_length = MAXLEN, limit=LIMIT)
    buffer_size = len(train_source_tensor)
    train_source_tensor, val_source_tensor, train_target_tensor, val_target_tensor = \
        train_test_split(train_source_tensor, train_target_tensor, random_state=2019)

    #TODO: check if we need target tokenizer
    training_steps = len(train_source_tensor) // BATCH_SIZE
    vocab_source_size = len(train_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(train_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)

    step = tf.Variable(0, trainable=False)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.001)
    # set up checkpoint
    if not os.path.exists(CKPT):
        os.makedirs(CKPT)
    else:
        print(
            "Warning: the checkpoint directory already exists!",
            "\nConsider choosing a new directory to avoid overwriting existing checkpoints.")
    checkpoint_prefix = os.path.join(CKPT, "ckpt")

    dataset = train_input_fn(train_source_tensor, train_target_tensor,
                             buffer_size, EPOCH, BATCH_SIZE)
    apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    encoder = Encoder(vocab_source_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      dropout_rate=DROPOUT,
                      batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      batch_size=BATCH_SIZE,
                      method=None,
                      dropout_rate=DROPOUT)
    # Track the optimizer and both models so test() can restore them from this checkpoint
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    def train_wrapper(source, target):
        with tf.GradientTape() as tape:
            source_out, source_state = encoder(source,
                                               encoder_state,
                                               activation="tanh")

            initial = tf.expand_dims(
                [train_target_tokenizer.word_index[start_word]] * BATCH_SIZE,
                1)
            attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
            # cur_total_loss accumulates the loss over decoding steps, i.e. the batch loss
            cur_total_loss, cur_loss = 0, 0
            for i in range(1, target.shape[1]):
                output_state, source_state, attention_state = decoder(
                    initial, source_state, source_out, attention_state)
                # TODO: check for the case where target is 0
                cur_loss = apply_loss(target[:, i], output_state)
                # 0 is assumed to be the padding value in target and should not appear
                # as a real token. For safety, a binary (0/1) mask zeroes out the loss
                # at padded positions.
                mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
                mask = tf.cast(mask, dtype=cur_loss.dtype)
                cur_loss *= mask
                cur_total_loss += tf.reduce_mean(cur_loss)
                initial = tf.expand_dims(target[:, i], 1)
        batch_loss = cur_total_loss / target.shape[1]
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(cur_total_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables), global_step=step)
        return batch_loss

    # print(len(train_source_tensor),BATCH_SIZE,training_steps,LIMIT)
    for epoch in range(EPOCH):
        per_epoch_loss = 0
        start = time.time()
        encoder_hidden = encoder.initialize_hidden_state()
        encoder_cell = encoder.initialize_cell_state()
        encoder_state = [[encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell]]
        # TODO : Double check to make sure all re-initialization is performed
        for idx, data in enumerate(dataset.take(training_steps)):

            source, target = data
            cur_total_loss = train_wrapper(source, target)
            per_epoch_loss += cur_total_loss
            if idx % 10 == 0:
                print("current learning rate is: " +
                      str(optimizer._learning_rate))
                print('Epoch {}/{} Batch {}/{} Loss {:.4f}'.format(
                    epoch + 1, EPOCH, idx + 1, training_steps,
                    cur_total_loss.numpy()))
            if step >= 5:
                optimizer._learning_rate /= 2.0

        print('Epoch {}/{} Total Loss per epoch {:.4f} - {} sec'.format(
            epoch + 1, EPOCH, per_epoch_loss / training_steps,
            time.time() - start))
        # TODO: add a BLEU score for evaluation
        if epoch % 10 == 0:
            print('Saving a checkpoint (every 10 epochs)')
            checkpoint.save(file_prefix=checkpoint_prefix)
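
Both train() and test() read their hyperparameters from a module-level args object that is not shown in these examples. Below is a rough sketch of an argparse setup that could supply it; the flag names mirror the attributes accessed above, while the defaults are purely illustrative assumptions.

# Minimal sketch of an argparse setup matching the attributes that train()/test()
# read from the global `args`. Defaults are illustrative assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--epoch", type=int, default=10)
parser.add_argument("--embeddingDim", type=int, default=256)
parser.add_argument("--maxLen", type=int, default=90)
parser.add_argument("--units", type=int, default=512)
parser.add_argument("--learning_rate", type=float, default=0.001)
parser.add_argument("--dropout", type=float, default=0.2)
parser.add_argument("--method", type=str, default=None)
parser.add_argument("--gpuNum", type=int, default=1)
parser.add_argument("--checkpoint", type=str, default="checkpoint")
parser.add_argument("--limit", type=int, default=None)
args = parser.parse_args()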