예제 #1
0
def test_training_classification(spark_context, mode, parameter_server_mode,
                                 mnist_data, classification_model):
    """Train a classifier through SparkModel and compare it with plain Keras.

    The test fits ``classification_model`` on the first 1000 training samples
    via ``SparkModel`` and then checks that:

    * predicting from a numpy array and from an RDD gives the same classes,
    * the Spark model and its ``master_network`` predict the same classes,
    * ``SparkModel.evaluate`` agrees with ``master_network.evaluate`` on the
      first two returned values within an absolute tolerance of 0.01.

    Arguments (all supplied by the caller / test fixtures):
    spark_context -- context used to build the training and test RDDs.
    mode -- forwarded to ``SparkModel(mode=...)``.
    parameter_server_mode -- forwarded to ``SparkModel(parameter_server_mode=...)``.
    mnist_data -- unpacked as ``(x_train, y_train, x_test, y_test)``.
    classification_model -- model object that is compiled and trained here.
    """
    # Define basic parameters
    batch_size = 64
    epochs = 10

    # Load data; only the first 1000 training samples are used
    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    sgd = SGD(lr=0.1)
    classification_model.compile(sgd, 'categorical_crossentropy', ['acc'])

    # Build RDD from numpy features and labels
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Initialize SparkModel from keras model and Spark context.
    # The port is drawn from 4000..4500 (presumably so that several test
    # runs do not collide on the same port -- TODO confirm).
    spark_model = SparkModel(classification_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=0,
                    validation_split=0.1)

    # run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # The reference class list is needed by two assertions; build it once.
    predicted_classes = [np.argmax(x) for x in predictions]

    # assert we can supply rdd and get same prediction results when supplying numpy array
    test_rdd = spark_context.parallelize(x_test)
    assert predicted_classes == [
        np.argmax(x) for x in spark_model.predict(test_rdd)
    ]

    # assert we get the same prediction result with calling predict on keras model directly
    assert predicted_classes == [
        np.argmax(x) for x in spark_model.master_network.predict(x_test)
    ]

    # assert we get the same evaluation results when calling evaluate on keras model directly.
    # Evaluate the master network once instead of once per compared value.
    master_evals = spark_model.master_network.evaluate(x_test, y_test)
    assert isclose(evals[0], master_evals[0], abs_tol=0.01)
    assert isclose(evals[1], master_evals[1], abs_tol=0.01)
예제 #2
0
def main():
    """Build training data from GPS/trip CSV pairs, train with Spark, report accuracy.

    Every ``gps_points.csv`` under ``../data/prototype/<dir>/`` is paired with
    the ``gps_trips.csv`` at the same position in the sorted listing. Each pair
    is run through ``process_file`` and ``build_seq``; the resulting ``x`` and
    ``y`` arrays are stacked, split 80/20 into train and validation sets, and
    a model from ``build_model`` is trained through ``SparkModel``. The
    validation accuracy is printed as a percentage.

    Raises:
        ValueError: if no input files are found or the two listings differ
            in length (the files could not be paired reliably).
    """
    # glob() returns files in arbitrary order; sort both listings so that
    # index i refers to the same directory in each of them.
    # Without recursive=True, '**' matches a single directory level like '*'.
    gps_files = sorted(glob.glob('../data/prototype/**/gps_points.csv'))
    trip_files = sorted(glob.glob('../data/prototype/**/gps_trips.csv'))

    if not gps_files or len(gps_files) != len(trip_files):
        raise ValueError(
            'Expected matching gps_points.csv / gps_trips.csv files, found '
            '%d and %d' % (len(gps_files), len(trip_files)))

    print('Bulding training data from files..')
    x_parts = []
    y_parts = []
    for trip_file, gps_file in zip(trip_files, gps_files):
        file_results = process_file(trip_file=trip_file, gps_file=gps_file)
        seq_results = build_seq(input_df=file_results['df'],
                                unique_trips=file_results['unique_trips'])
        x_parts.append(seq_results['x'])
        y_parts.append(seq_results['y'])

    # Stack once at the end: re-stacking inside the loop copies all
    # previously accumulated rows on every iteration.
    X = np.vstack(x_parts)
    y = np.vstack(y_parts)

    x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=1, train_size=0.8)

    rdd = to_simple_rdd(sc, x_train, y_train)

    model = build_model()

    spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

    spark_model.fit(rdd, epochs=5, batch_size=32, verbose=0, validation_split=0.1)

    # asarray: the accuracy computation needs array semantics even if
    # predict() hands back a plain list of rows.
    y_pred = np.asarray(spark_model.predict(x_val))

    # Fraction of validation rows whose predicted class equals the true class
    acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_val, axis=1))

    print("Validation Accuracy: {number:.{digits}f}%".format(number=(acc*100), digits=2))
예제 #3
0
def test_training_regression(spark_context, mode, parameter_server_mode,
                             boston_housing_dataset, regression_model):
    """Train a regressor through SparkModel and compare it with plain Keras.

    The test fits ``regression_model`` via ``SparkModel`` and then checks that:

    * predicting from a numpy array and from an RDD gives values that are
      close (relative tolerance 0.01),
    * the Spark model and its ``master_network`` give close predictions,
    * ``SparkModel.evaluate`` agrees with ``master_network.evaluate`` on the
      first two returned values within an absolute tolerance of 0.01.

    Arguments (all supplied by the caller / test fixtures):
    spark_context -- context used to build the training and test RDDs.
    mode -- forwarded to ``SparkModel(mode=...)``.
    parameter_server_mode -- forwarded to ``SparkModel(parameter_server_mode=...)``.
    boston_housing_dataset -- unpacked as ``(x_train, y_train, x_test, y_test)``.
    regression_model -- model object that is compiled and trained here.
    """
    x_train, y_train, x_test, y_test = boston_housing_dataset
    rdd = to_simple_rdd(spark_context, x_train, y_train)

    # Define basic parameters
    batch_size = 64
    epochs = 10
    sgd = SGD(lr=0.0000001)
    regression_model.compile(sgd, 'mse', ['mae'])
    # The port is drawn from 4000..4500 (presumably so that several test
    # runs do not collide on the same port -- TODO confirm).
    spark_model = SparkModel(regression_model,
                             frequency='epoch',
                             mode=mode,
                             parameter_server_mode=parameter_server_mode,
                             port=4000 + random.randint(0, 500))

    # Train Spark model
    spark_model.fit(rdd,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=0,
                    validation_split=0.1)

    # run inference on trained spark model
    predictions = spark_model.predict(x_test)
    # run evaluation on trained spark model
    evals = spark_model.evaluate(x_test, y_test)

    # assert we can supply rdd and get same prediction results when supplying numpy array
    test_rdd = spark_context.parallelize(x_test)
    # The third positional argument of np.isclose is rtol; spell it out.
    assert all(
        np.isclose(x, y, rtol=0.01)
        for x, y in zip(predictions, spark_model.predict(test_rdd)))

    # assert we get the same prediction result with calling predict on keras model directly
    assert all(
        np.isclose(x, y, rtol=0.01) for x, y in zip(
            predictions, spark_model.master_network.predict(x_test)))

    # assert we get the same evaluation results when calling evaluate on keras model directly.
    # Evaluate the master network once instead of once per compared value.
    master_evals = spark_model.master_network.evaluate(x_test, y_test)
    assert isclose(evals[0], master_evals[0], abs_tol=0.01)
    assert isclose(evals[1], master_evals[1], abs_tol=0.01)
예제 #4
0
def test_training_custom_activation(mode, spark_context):
    """Check that a model using a user-defined activation trains via SparkModel.

    A two-layer network whose first layer uses a locally defined activation is
    trained for one epoch; the activation is handed to ``SparkModel`` through
    ``custom_objects``. The test passes when both ``predict`` and ``evaluate``
    return something truthy.

    Arguments:
    mode -- forwarded to ``SparkModel(mode=...)``.
    spark_context -- context used to build the training RDD.
    """
    def custom_activation(x):
        # Sigmoid shifted up by one
        return sigmoid(x) + 1

    model = Sequential([
        Dense(1, input_dim=1, activation=custom_activation),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=SGD(lr=0.1),
                  loss='binary_crossentropy',
                  metrics=['acc'])

    # Random 1-d features; the first half of the training labels is 1,
    # every other label is 0.
    x_train = np.random.rand(1000)
    y_train = np.zeros(1000)
    y_train[:500] = 1
    x_test = np.random.rand(100)
    y_test = np.zeros(100)

    train_rdd = to_simple_rdd(spark_context, x_train, y_train)

    # The activation is registered under its own name via custom_objects
    extra_objects = {'custom_activation': custom_activation}
    spark_model = SparkModel(model,
                             frequency='epoch',
                             mode=mode,
                             custom_objects=extra_objects)
    spark_model.fit(train_rdd,
                    epochs=1,
                    batch_size=16,
                    verbose=0,
                    validation_split=0.1)

    assert spark_model.predict(x_test)
    assert spark_model.evaluate(x_test, y_test)
예제 #5
0
파일: HAN.py 프로젝트: sd12832/HAN
class HAN(object):
    """
    Hierarchical Attention Network (HAN) text classifier trained through elephas.

    The constructor preprocesses the given texts, splits them into training
    and validation sets, loads pretrained word vectors, builds and compiles
    the Keras model, and wraps it in a ``SparkModel``.
    """
    def __init__(self,
                 text,
                 labels,
                 pretrained_embedded_vector_path,
                 max_features,
                 max_senten_len,
                 max_senten_num,
                 embedding_size,
                 num_categories=None,
                 validation_split=0.2,
                 verbose=0):
        """Initialize the HAN module
        Keyword arguments:
        text -- list of the articles for training.
        labels -- labels corresponding to the given `text`.
        pretrained_embedded_vector_path -- path of any pretrained vector file
        max_features -- max features the embedding matrix can have. For more, check out https://keras.io/layers/embeddings/
        max_senten_len -- maximum sentence length. It is recommended not to use the maximum one but the one that covers the 0.95 quantile of the data.
        max_senten_num -- maximum number of sentences. It is recommended not to use the maximum one but the one that covers the 0.95 quantile of the data.
        embedding_size -- size of the embedding vector
        num_categories -- total number of categories; when given it must equal the number of distinct labels.
        validation_split -- fraction of the data held out for validation.
        verbose -- set to 1 to print progress information.

        Raises:
        ValueError -- if `text` and `labels` differ in length, or if
                      `num_categories` does not match the distinct labels.
        """
        self.verbose = verbose
        self.max_features = max_features
        self.max_senten_len = max_senten_len
        self.max_senten_num = max_senten_num
        self.embed_size = embedding_size
        self.validation_split = validation_split
        self.embedded_dir = pretrained_embedded_vector_path
        self.text = pd.Series(text)
        self.categories = pd.Series(labels)
        self.classes = self.categories.unique().tolist()
        # Nothing in this class records a training history yet;
        # plot_results() checks for this.
        self.history = None
        # Initialize default hyperparameters
        # You can change it using `set_hyperparameters` function
        self.hyperparameters = {
            'l2_regulizer': None,
            'dropout_regulizer': None,
            'rnn': LSTM,
            'rnn_units': 150,
            'dense_units': 200,
            'activation': 'softmax',
            'optimizer': 'adam',
            'metrics': ['acc'],
            'loss': 'categorical_crossentropy'
        }
        # Validate explicitly instead of asserting and swallowing the error:
        # continuing after a failed check would leave the object without
        # data or a model.
        if num_categories is not None and num_categories != len(self.classes):
            raise ValueError(
                'num_categories (%s) does not match the %d distinct labels' %
                (num_categories, len(self.classes)))
        if self.text.shape[0] != self.categories.shape[0]:
            raise ValueError('Input and label data must be of same size')

        self.data, self.labels = self.preprocessing()
        (self.x_train, self.y_train, self.x_val,
         self.y_val) = self.split_dataset()
        self.embedding_index = self.add_glove_model()
        self.set_model()

        # TODO: allow the Spark master to be configured once the possible
        # failure modes are known.
        conf = SparkConf().setAppName('HANMusicClassifier')
        # getOrCreate re-uses an already running context instead of failing
        # when a second instance is constructed in the same process.
        self.sc = SparkContext.getOrCreate(conf=conf)

    def set_hyperparameters(self, tweaked_instances):
        """Set hyperparameters of HAN model.
        Keyword arguments:
        tweaked_instances -- dictionary of all those keys you want to change

        Raises:
        KeyError -- if any key is not a known hyperparameter; in that case
                    no hyperparameter is changed.
        """
        # Validate every key first so a bad key cannot leave the
        # hyperparameters half-updated.
        for key in tweaked_instances:
            if key not in self.hyperparameters:
                raise KeyError(str(key) + ' does not exist in hyperparameters')
        if not tweaked_instances:
            return
        self.hyperparameters.update(tweaked_instances)
        # Rebuild once after all changes, not once per changed key.
        self.set_model()

    def show_hyperparameters(self):
        """To check the values of all the current hyperparameters
        """
        print('Hyperparameter\tCorresponding Value')
        for key, value in self.hyperparameters.items():
            print(key, '\t\t', value)

    def clean_string(self, string):
        """
        String cleaning for the dataset: removes backslashes, single quotes
        and double quotes, strips surrounding whitespace and lower-cases.
        """
        string = re.sub(r"\\", "", string)
        string = re.sub(r"\'", "", string)
        string = re.sub(r"\"", "", string)
        return string.strip().lower()

    def add_dataset(self, text, labels):
        """Append more texts and labels to the stored dataset.

        Only labels that already belong to `self.classes` are accepted; if a
        new class shows up a message is printed and nothing is added. The
        preprocessed tensors and the train/validation split are not
        recomputed here.
        Keyword arguments:
        text -- additional articles.
        labels -- labels corresponding to the given `text`.
        """
        new_text = pd.Series(text)
        new_labels = pd.Series(labels)
        # Check before mutating so a rejected batch leaves the data untouched.
        if not set(new_labels.unique().tolist()).issubset(self.classes):
            print("New class cannot be added in this manner")
            return
        # ignore_index keeps a clean 0..n-1 index on the combined series.
        self.text = pd.concat([self.text, new_text], ignore_index=True)
        self.categories = pd.concat([self.categories, new_labels],
                                    ignore_index=True)

    def preprocessing(self):
        """Turn the raw texts into an index tensor and one-hot labels.

        Each text is cleaned and split into sentences; every word is replaced
        by its tokenizer index. The result has shape
        (number of texts, max_senten_num, max_senten_len) and is zero-padded.
        Words that are unknown to the tokenizer or whose index is not below
        `max_features` are skipped. Also sets `self.word_index`.

        Returns the tuple (data, labels) where labels is the one-hot
        DataFrame produced by `pd.get_dummies`.
        """
        # Iterating the Series yields values in positional order, so this
        # does not depend on the index labels.
        texts = [self.clean_string(raw) for raw in self.text]
        paras = [tokenize.sent_tokenize(cleaned) for cleaned in texts]
        # NOTE(review): oov_token is passed as True here; confirm this is
        # intended, the tokenizer normally takes a string token.
        tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
        tokenizer.fit_on_texts(texts)
        word_index = tokenizer.word_index
        data = np.zeros((len(texts), self.max_senten_num, self.max_senten_len),
                        dtype='int32')
        for i, sentences in enumerate(paras):
            # Sentences beyond max_senten_num are dropped.
            for j, sent in enumerate(sentences[:self.max_senten_num]):
                k = 0
                for word in text_to_word_sequence(sent):
                    if k >= self.max_senten_len:
                        # Sentence slot is full; remaining words are dropped.
                        break
                    index = word_index.get(word)
                    if index is not None and index < self.max_features:
                        data[i, j, k] = index
                        k += 1
        self.word_index = word_index
        if self.verbose == 1:
            print('Total %s unique tokens.' % len(self.word_index))
        labels = pd.get_dummies(self.categories)
        if self.verbose == 1:
            print('Shape of data tensor:', data.shape)
            print('Shape of labels tensor:', labels.shape)
        assert (len(self.classes) == labels.shape[1])
        assert (data.shape[0] == labels.shape[0])
        return data, labels

    def split_dataset(self):
        """Shuffle the data and split it into training and validation parts.

        `self.data` and `self.labels` are shuffled in place with the same
        permutation. The last int(validation_split * n) rows become the
        validation set.

        Returns the tuple (x_train, y_train, x_val, y_val).
        """
        indices = np.arange(self.data.shape[0])
        np.random.shuffle(indices)
        self.data = self.data[indices]
        self.labels = self.labels.iloc[indices]
        nb_validation_samples = int(self.validation_split * self.data.shape[0])
        # Slice with an explicit boundary: with zero validation samples a
        # negative-index slice ([:-0]) would give an empty training set.
        split_at = self.data.shape[0] - nb_validation_samples

        x_train = self.data[:split_at]
        y_train = self.labels.iloc[:split_at]
        x_val = self.data[split_at:]
        y_val = self.labels.iloc[split_at:]
        if self.verbose == 1:
            print(
                'Number of positive and negative reviews in traing and validation set'
            )
            print(y_train.columns.tolist())
            print(y_train.sum(axis=0).tolist())
            print(y_val.sum(axis=0).tolist())
        return x_train, y_train, x_val, y_val

    def get_model(self):
        """
        Returns the HAN model so that it can be used as a part of pipeline
        """
        return self.model

    def add_glove_model(self):
        """
        Read the pretrained embedding file into a dict of word -> vector.

        Each non-empty line is split on whitespace: the first token is the
        word, the rest are its coefficients. Reading stops with a printed
        message at the first line whose vector length differs from
        `self.embed_size`; the entries read so far are returned. If the file
        cannot be opened a message is printed and the interpreter exits.
        """
        embeddings_index = {}
        try:
            # `with` closes the file even when reading stops early.
            with open(self.embedded_dir, encoding='utf-8') as f:
                for line in f:
                    values = line.split()
                    if not values:
                        # Skip blank lines instead of failing on values[0].
                        continue
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                    if coefs.shape[0] != self.embed_size:
                        print(
                            "Embedding vector size does not match with given embedded size"
                        )
                        break
                    embeddings_index[word] = coefs
        except OSError:
            print('Embedded file does not found')
            raise SystemExit
        return embeddings_index

    def get_embedding_matrix(self):
        """
        Returns the embedding matrix of shape (len(word_index) + 1, embed_size).

        Rows are initialised with random values; rows of words found in the
        pretrained embedding index are overwritten with the pretrained vector.
        """
        embedding_matrix = np.random.random(
            (len(self.word_index) + 1, self.embed_size))
        absent_words = 0
        for word, i in self.word_index.items():
            embedding_vector = self.embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
            else:
                # words not found in the embedding index keep their random row.
                absent_words += 1
        if self.verbose == 1:
            # `or 1` avoids a division by zero for an empty vocabulary.
            total_words = len(self.word_index) or 1
            print('Total absent words are', absent_words, 'which is',
                  "%0.2f" % (absent_words * 100 / total_words),
                  '% of total words')
        return embedding_matrix

    def get_embedding_layer(self):
        """
        Returns a non-trainable Embedding layer initialised with the
        embedding matrix.
        """
        embedding_matrix = self.get_embedding_matrix()
        return Embedding(len(self.word_index) + 1,
                         self.embed_size,
                         weights=[embedding_matrix],
                         input_length=self.max_senten_len,
                         trainable=False)

    def set_model(self):
        """
        Set the HAN model according to the given hyperparameters

        Builds a word encoder (embedding -> bidirectional RNN ->
        time-distributed dense -> attention), applies it to every sentence,
        runs the same stack on sentence level and ends in a dense output
        layer with one unit per class. The compiled model is stored in
        `self.model` and wrapped in `self.spark_model`.
        """
        if self.hyperparameters['l2_regulizer'] is None:
            kernel_regularizer = None
        else:
            kernel_regularizer = regularizers.l2(
                self.hyperparameters['l2_regulizer'])
        if self.hyperparameters['dropout_regulizer'] is None:
            # "No dropout" is rate 0; a rate of 1 means dropping every unit.
            dropout_regularizer = 0.0
        else:
            dropout_regularizer = self.hyperparameters['dropout_regulizer']
        word_input = Input(shape=(self.max_senten_len, ), dtype='float32')
        word_sequences = self.get_embedding_layer()(word_input)
        word_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(word_sequences)
        word_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(word_lstm)
        word_att = AttentionWithContext()(word_dense)
        wordEncoder = Model(word_input, word_att)

        sent_input = Input(shape=(self.max_senten_num, self.max_senten_len),
                           dtype='float32')
        sent_encoder = TimeDistributed(wordEncoder)(sent_input)
        sent_lstm = Bidirectional(self.hyperparameters['rnn'](
            self.hyperparameters['rnn_units'],
            return_sequences=True,
            kernel_regularizer=kernel_regularizer))(sent_encoder)
        sent_dense = TimeDistributed(
            Dense(self.hyperparameters['dense_units'],
                  kernel_regularizer=kernel_regularizer))(sent_lstm)
        sent_att = Dropout(dropout_regularizer)(
            AttentionWithContext()(sent_dense))
        # Apply the configured output activation; without it the
        # 'activation' hyperparameter had no effect and the loss received
        # raw linear outputs.
        preds = Dense(len(self.classes),
                      activation=self.hyperparameters['activation'])(sent_att)
        self.model = Model(sent_input, preds)
        self.model.compile(loss=self.hyperparameters['loss'],
                           optimizer=self.hyperparameters['optimizer'],
                           metrics=self.hyperparameters['metrics'])
        self.spark_model = SparkModel(self.model,
                                      frequency='epoch',
                                      mode='asynchronous')

    # Currently cannot plot learning curve
    def train_model(self,
                    rdd,
                    epochs,
                    batch_size,
                    verbose=1,
                    validation_split=0.1):
        """Training the model
        rdd  -- The actual data
        epochs -- Total number of epochs
        batch_size -- size of a batch
        verbose -- Whether or not we want verbose feedback
        validation_split -- What percentage of the data from the rdd is actually used as a validation set
        """
        # `fit` is already bound to the spark model; passing `self` as well
        # would shift every positional argument by one.
        self.spark_model.fit(rdd,
                             epochs=epochs,
                             batch_size=batch_size,
                             verbose=verbose,
                             validation_split=validation_split)

    def predict(self, rdd):
        """Run inference with the Spark model and return its predictions.
        rdd -- the data to predict on, passed through to `SparkModel.predict`
        """
        return self.spark_model.predict(rdd)

    def plot_results(self):
        """
        Plotting learning curve of last trained model.

        Requires `self.history` to hold a Keras-style history object with
        'acc', 'val_acc', 'loss' and 'val_loss' entries.

        Raises:
        AttributeError -- if no training history has been stored.
        """
        history = getattr(self, 'history', None)
        if history is None:
            raise AttributeError(
                'No training history is available to plot; '
                '`self.history` has not been set')

        # summarize history for accuracy
        plt.subplot(211)
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')

        # summarize history for loss; plot training first so the curves
        # match the legend order.
        plt.subplot(212)
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        time.sleep(10)
        plt.close()
예제 #6
0
# Train the Spark model on the prepared RDD
spark_model.fit(rdd, epochs=50, batch_size=500, validation_split=0.01)
LOGGER.info('Spark model trained')
# summary() prints its report and returns None, so logging its return value
# would only log "None"; route the report lines to the logger instead.
model.summary(print_fn=LOGGER.info)

# Every test sample is a window of the 30 preceding rows, so the test slice
# starts 30 rows before the train/test boundary.
test_data = scaled_data[training_data_len - 30:, :]
y_test = dataset[training_data_len:, :]

# One window of column 0 per test row
x_test = np.array(
    [test_data[i - 30:i, 0] for i in range(30, len(test_data))])

# Add a trailing axis of size 1: (samples, window) -> (samples, window, 1)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

predictions = spark_model.predict(x_test)
# Undo the scaling applied by `scaler` before comparing with `y_test`
predictions = scaler.inverse_transform(predictions)

rmse = np.sqrt(np.mean(((predictions - y_test)**2)))
# A bare `rmse` expression in the middle of a script displays nothing
LOGGER.info('RMSE: %s', rmse)

train = data_pd[:training_data_len]
# Copy the slice so that adding a column does not write into a view of data_pd
valid = data_pd[training_data_len:].copy()
valid['Predictions'] = predictions
plt.figure(figsize=(16, 8))
plt.title('Model prediction on EUR/HUF')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price EUR/HUF', fontsize=18)
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
예제 #7
0
# Three-layer fully connected network with sigmoid activations: 26 -> 18 -> 6 -> 1
model = Sequential()
model.add(Dense(18, input_dim=26))
model.add(Activation('sigmoid'))
model.add(Dense(6))
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

spark = SparkSession.builder.appName('ElephasTest').getOrCreate()
rdd = to_simple_rdd(spark.sparkContext, train, y_train)

# Two optimizers are passed to SparkModel: `optimizer` (elephas Adagrad)
# and `master_optimizer` (Keras SGD).
sgd = SGD(lr=0.1)
adagrad = elephas_optimizers.Adagrad()
spark_model = SparkModel(spark.sparkContext,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         master_loss='mse',
                         num_workers=2, master_optimizer=sgd)

# Train Spark model
spark_model.train(rdd, nb_epoch=nb_epoch, batch_size=batch_size, verbose=2, validation_split=0.1)

# Evaluate Spark model by evaluating the underlying model
score = spark_model.master_network.evaluate(test.values, y_test, verbose=2)
# Single-argument print calls behave the same on Python 2 and Python 3;
# print statements fail on Python 3 and print('a', b) prints a tuple on
# Python 2.
print('Test accuracy: %s' % (score[1],))
print(spark_model.predict(test.values))
print(y_test)
#---(i.e. in training each worker will train on part of the data)
rdd = to_simple_rdd(sc, X_train, y_train)

#---Initialize SparkModel from Keras model and Spark context
#---there are two optimizers needed:
sgd = SGD(lr=0.1)  #<---the master optimizer
adagrad = elephas_optimizers.Adagrad()  #<---the elephas optimizer
spark_model = SparkModel(sc,
                         model,
                         optimizer=adagrad,
                         frequency='epoch',
                         mode='asynchronous',
                         num_workers=args.N_workers,
                         master_optimizer=sgd)

#---Train Spark model
spark_model.train(rdd,
                  nb_epoch=args.nb_epoch,
                  batch_size=args.batch_size,
                  verbose=1,
                  validation_split=0.25)

#---Predict on the test set and score the arg-max class of every row
pred = spark_model.predict(X_test)
#---Single-argument print calls run on both Python 2 and Python 3 and
#---produce the same text the former print statements did.
print(np.shape(pred))
print(np.shape(y_test))
acc = accuracy_score([np.argmax(y) for y in y_test],
                     [np.argmax(p) for p in pred])
print("--->test accuracy:  %s" % (acc,))
print("--->number of workers:  %s" % (args.N_workers,))
print("--->time:  %s" % (time.time() - start_time,))
class KerasNeuralNetworkSpark(object):
    """Feed-forward Keras network trained through an elephas ``SparkModel``.

    Offers ``fit`` / ``transform`` methods that take Spark DataFrames; the
    data is converted to pandas on the driver in both cases.
    """

    def __init__(self, layers, spark, batch_size=64, epoch=10, num_workers=2, predictionCol='prediction',
                 labelCol='target', featuresCol='feature'):
        """Store the configuration and build the Spark model.

        layers -- sequence of layer sizes; layers[0] is the input dimension,
                  layers[-1] the size of the output layer.
        spark -- object providing ``sparkContext`` and ``createDataFrame``.
        batch_size, epoch -- passed to ``SparkModel.train`` in ``fit``.
        num_workers -- passed to ``SparkModel`` as ``num_workers``.
        predictionCol -- name of the column ``transform`` writes to.
        labelCol, featuresCol -- names of the columns ``fit`` reads.
        """
        self._spark = spark
        self._layers = layers
        self._worker_num = num_workers
        self._batch_size = batch_size
        self._epoch = epoch
        self._features = featuresCol
        self._labels = labelCol
        self._prediction = predictionCol
        self._model = None
        self._build_model()

    def _build_model(self):
        """Create the dense network and wrap it in an asynchronous SparkModel."""
        sizes = self._layers
        network = Sequential()
        optimizer = elephas_optimizers.Adam()
        # First hidden layer also fixes the input dimension
        network.add(Dense(sizes[1], input_dim=sizes[0], init='normal', activation='relu'))
        # Remaining hidden layers: everything between the first hidden
        # layer and the output layer
        for width in sizes[2:-1]:
            network.add(Dense(width, activation='relu'))

        network.add(Dense(sizes[-1], activation='sigmoid'))
        self._model = SparkModel(self._spark.sparkContext, network,
                                 optimizer=optimizer,
                                 frequency='epoch',
                                 mode='asynchronous',
                                 master_loss='mse',
                                 num_workers=self._worker_num)

    def fit(self, df):
        """Train the model on the feature and label columns of ``df``.

        If the model already has a ``server`` attribute it is terminated
        before training starts.
        """
        if hasattr(self._model, 'server'):
            self._model.server.terminate()
        frame = df.toPandas()
        features = frame[self._features]
        targets = frame[self._labels]

        train_rdd = to_simple_rdd(self._spark.sparkContext, features, targets)
        # Positional arguments: epochs, batch size, 0 and 0.1
        self._model.train(train_rdd, self._epoch, self._batch_size, 0, 0.1)

    def transform(self, df):
        """Predict for every row of ``df`` and return a new Spark DataFrame.

        The returned frame is built from the pandas copy of ``df`` with the
        predictions stored in the prediction column.
        """
        frame = df.toPandas()
        rows = frame[self._features].values
        # Copy the per-row feature sequences into one dense 2-d float array
        dense = np.zeros((rows.shape[0], len(rows[0])))
        for position, row in enumerate(rows):
            dense[position, :] = row[:]

        frame[self._prediction] = self._model.predict(dense)
        return self._spark.createDataFrame(frame)

    def stop_server(self):
        """Terminate the model's server if it has one that can be terminated."""
        if not hasattr(self._model, 'server'):
            return
        server = self._model.server
        if hasattr(server, 'terminate'):
            server.terminate()