def read_rdd(esConfig, esResource=None, filter=None, esQuery=None):
    """
    Read the data from Elasticsearch into a Spark RDD.

    :param esConfig: Dictionary which represents the configuration for
           Elasticsearch (e.g. ip, port, es query, etc.).
    :param esResource: Optional. Resource file in Elasticsearch. It can also
           be set in esConfig.
    :param filter: Optional. Request only those fields from Elasticsearch.
    :param esQuery: Optional. The es query.
    :return: Spark RDD
    """
    sc = init_nncontext()
    if "es.resource" not in esConfig:
        esConfig["es.resource"] = esResource
    if filter is not None:
        esConfig["es.read.source.filter"] = filter
    if esQuery is not None:
        esConfig["es.query"] = esQuery
    rdd = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
                             "org.apache.hadoop.io.NullWritable",
                             "org.elasticsearch.hadoop.mr.LinkedMapWritable",
                             conf=esConfig)
    return rdd
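# A minimal usage sketch for read_rdd. The host, port and index name below are
# hypothetical placeholders, not values from the original example; running it
# requires a reachable Elasticsearch node and the elasticsearch-hadoop
# connector on the classpath.
es_conf = {"es.nodes": "localhost", "es.port": "9200"}
records = read_rdd(es_conf, esResource="my_index/my_type")
print(records.take(1))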
def partition(data, num_shards=None):
    """
    Partition local in memory data and form a SparkXShards.

    :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested
           structure made of tuple, list, dict with ndarray as the leaf value.
    :param num_shards: the number of shards that the data will be partitioned into.
    :return: a SparkXShards
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    shard_num = node_num * core_num if num_shards is None else num_shards
    import numpy as np
    type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
""".format(type(data))
    supported_types = {list, tuple, dict}
    if isinstance(data, np.ndarray):
        if data.shape[0] < shard_num:
            raise ValueError("The length of data {} is smaller than the total number "
                             "of shards {}. Please adjust the num_shards option to be "
                             "at most {}.".format(data.shape[0], shard_num, data.shape[0]))
        arrays = np.array_split(data, shard_num)
        rdd = sc.parallelize(arrays)
    else:
        assert type(data) in supported_types, type_err_msg
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        data_to_be_shard = []
        if data_length < shard_num:
            raise ValueError("The length of data {} is smaller than the total number "
                             "of shards {}. Please adjust the num_shards option to be "
                             "at most {}.".format(data_length, shard_num, data_length))
        for i in range(shard_num):
            data_to_be_shard.append([])
        for x in flattened:
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension, " \
                "got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, shard_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_shard[idx].append(x_part)
        data_to_be_shard = [nest.pack_sequence_as(data, shard)
                            for shard in data_to_be_shard]
        rdd = sc.parallelize(data_to_be_shard)
    data_shards = SparkXShards(rdd)
    return data_shards
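# A minimal usage sketch for partition, assuming the surrounding module
# context (a running Spark session, nest and SparkXShards imported). The
# array shapes are made up for illustration; each resulting shard is a dict
# with the same keys, holding one split of each ndarray.
import numpy as np
features = {"x": np.random.rand(16, 4), "y": np.random.rand(16, 1)}
shards = partition(features, num_shards=4)  # 4 shards of 4 samples each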
def load_pickle(cls, path, minPartitions=None):
    """
    Load XShards from pickle files.

    :param path: The pickle file path/directory.
    :param minPartitions: The minimum number of partitions for the XShards.
    :return: SparkXShards object
    """
    sc = init_nncontext()
    return SparkXShards(sc.pickleFile(path, minPartitions))
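# A minimal usage sketch for load_pickle, assuming it is exposed as a
# classmethod on SparkXShards; the path is a made-up placeholder pointing at
# pickle files written earlier.
shards = SparkXShards.load_pickle("/tmp/shards_pickle")
data = shards.collect()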
def partition(data):
    """
    Partition local in memory data and form a SparkXShards.

    :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested
           structure made of tuple, list, dict with ndarray as the leaf value.
    :return: a SparkXShards
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    total_core_num = node_num * core_num
    import numpy as np
    type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
""".format(type(data))
    supported_types = {list, tuple, dict}
    if isinstance(data, np.ndarray):
        arrays = np.array_split(data, total_core_num)
        rdd = sc.parallelize(arrays)
    else:
        assert type(data) in supported_types, type_err_msg
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        data_to_be_shard = []
        for i in range(total_core_num):
            data_to_be_shard.append([])
        for x in flattened:
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension, " \
                "got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, total_core_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_shard[idx].append(x_part)
        data_to_be_shard = [nest.pack_sequence_as(data, shard)
                            for shard in data_to_be_shard]
        rdd = sc.parallelize(data_to_be_shard)
    data_shards = SparkXShards(rdd)
    return data_shards
def predict(model_path, img_path):
    # BATCH_SIZE is defined at module level in the original example.
    model = InferenceModel()
    model.load_openvino(model_path,
                        weight_path=model_path[:model_path.rindex(".")] + ".bin",
                        batch_size=BATCH_SIZE)
    sc = init_nncontext("OpenVINO Python resnet_v1_50 Inference Example")
    # Pre-processing: decode, resize, center-crop and convert to NHWC RGB tensors.
    infer_transformer = ChainedPreprocessing([ImageBytesToMat(),
                                              ImageResize(256, 256),
                                              ImageCenterCrop(224, 224),
                                              ImageMatToTensor(format="NHWC", to_RGB=True)])
    image_set = ImageSet.read(img_path, sc).\
        transform(infer_transformer).get_image().collect()
    image_set = np.expand_dims(image_set, axis=1)

    for i in range(len(image_set) // BATCH_SIZE + 1):
        index = i * BATCH_SIZE
        # Stop once the batch start index runs past the end of the data.
        if index >= len(image_set):
            break
        batch = image_set[index]
        # Stack up to BATCH_SIZE images into one batch.
        for j in range(index + 1, min(index + BATCH_SIZE, len(image_set))):
            batch = np.vstack((batch, image_set[j]))
        batch = np.expand_dims(batch, axis=0)
        # Predict the batch.
        predictions = model.predict(batch)
        result = predictions[0]

        # Post-processing: print the Top-1 class index for each image.
        print("batch_" + str(i))
        for r in result:
            output = {}
            max_index = np.argmax(r)
            output["Top-1"] = str(max_index)
            print("* Predict result " + str(output))
    print("finished...")
    sc.stop()
def read_df(esConfig, esResource, schema=None):
    """
    Read the data from Elasticsearch into a Spark DataFrame.

    :param esConfig: Dictionary which represents the configuration for
           Elasticsearch (e.g. ip, port, etc.).
    :param esResource: Resource file in Elasticsearch.
    :param schema: Optional. Defines the schema of the Spark DataFrame. If each
           column in ES holds a single value, the schema does not need to be set.
    :return: Spark DataFrame. Each row represents a document in ES.
    """
    sc = init_nncontext()
    sqlContext = SQLContext.getOrCreate(sc)
    spark = sqlContext.sparkSession
    # DataFrameReader is obtained via spark.read (spark.read_df does not exist).
    reader = spark.read.format("org.elasticsearch.spark.sql")
    for key in esConfig:
        reader = reader.option(key, esConfig[key])
    if schema:
        reader = reader.schema(schema)
    df = reader.load(esResource)
    return df
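# A minimal usage sketch for read_df; the connection settings and index name
# are hypothetical placeholders and require a reachable Elasticsearch node
# with the elasticsearch-spark connector on the classpath.
es_conf = {"es.nodes": "localhost", "es.port": "9200"}
df = read_df(es_conf, "my_index/my_type")
df.show(5)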
def predict(model_path, image_path, top_n):
    sc = init_nncontext("Image classification inference example using int8 quantized model")
    images = ImageSet.read(image_path, sc, image_codec=1)
    model = ImageClassifier.load_model(model_path)
    output = model.predict_image_set(images)
    label_map = model.get_config().label_map()

    # List of (uri, results) tuples, one per image.
    predicts = output.get_predict().collect()
    sequential = Sequential()
    sequential.add(Activation("softmax", input_shape=predicts[0][1][0].shape))
    for pre in predicts:
        (uri, probs) = pre
        out = sequential.forward(probs[0])
        sortedProbs = [(prob, index) for index, prob in enumerate(out)]
        sortedProbs.sort()
        print("Image : %s, top %d prediction result" % (uri, top_n))
        for i in range(top_n):
            # sortedProbs is in ascending order, so the top predictions sit at
            # the end (the original hardcoded index 999 assumed 1000 classes).
            (prob, label_idx) = sortedProbs[len(sortedProbs) - 1 - i]
            print("\t%s, %f" % (label_map[label_idx], prob))
# limitations under the License.
#

import numpy as np

from bigdl.optim.optimizer import Adam
from keras.datasets import imdb
from keras.preprocessing import sequence
from zoo.pipeline.api.keras.models import Model
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.autograd import *
from zoo.common.nncontext import init_spark_conf
from zoo.common.nncontext import init_nncontext

conf = init_spark_conf()
conf.set("spark.executor.extraJavaOptions", "-Xss512m")
conf.set("spark.driver.extraJavaOptions", "-Xss512m")
sc = init_nncontext(conf)

max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

xmb = np.zeros((len(x_train), max_len, 2), dtype=np.int32)
import numpy as np

from zoo.pipeline.api.keras.layers import *
from zoo.models.recommendation import UserItemFeature
from zoo.models.recommendation import NeuralCF
from zoo.common.nncontext import init_nncontext
import matplotlib
from sklearn import metrics
from operator import itemgetter
from bigdl.dataset import movielens
from bigdl.util.common import *

sc = init_nncontext("NCF Example")

movielens_data = movielens.get_id_ratings("/tmp/movielens/")
min_user_id = np.min(movielens_data[:, 0])
max_user_id = np.max(movielens_data[:, 0])
min_movie_id = np.min(movielens_data[:, 1])
max_movie_id = np.max(movielens_data[:, 1])
rating_labels = np.unique(movielens_data[:, 2])

print(movielens_data.shape)
print(min_user_id, max_user_id, min_movie_id, max_movie_id, rating_labels)

def build_sample(user_id, item_id, rating):
    sample = Sample.from_ndarray(np.array([user_id, item_id]), np.array([rating]))
    return UserItemFeature(user_id, item_id, sample)

pairFeatureRdds = sc.parallelize(movielens_data)\
    .map(lambda x: build_sample(x[0], x[1], x[2] - 1))
pairFeatureRdds.take(3)

trainPairFeatureRdds, valPairFeatureRdds = pairFeatureRdds.randomSplit([0.8, 0.2], seed=1)
valPairFeatureRdds.cache()
train_rdd = trainPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
val_rdd = valPairFeatureRdds.map(lambda pair_feature: pair_feature.sample)
        sortedProbs.sort()
        print("Image : %s, top %d prediction result" % (uri, topN))
        for i in range(topN):
            # sortedProbs is in ascending order, so the top predictions sit at
            # the end (the original hardcoded index 999 assumed 1000 classes).
            (prob, label_idx) = sortedProbs[len(sortedProbs) - 1 - i]
            print("\t%s, %f" % (labelMap[label_idx], prob))


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-f", "--folder", type=str, dest="img_path", default=".",
                      help="Path where the images are stored")
    parser.add_option("--model", type=str, dest="model_path", default="",
                      help="Path where the model is stored")
    parser.add_option("--topN", type=int, dest="topN", default=1,
                      help="top N number")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Image Classification Example")
    predict(options.model_path, options.img_path, options.topN)
# Imports reconstructed for this snippet; the TorchNet import path is assumed
# from the analytics-zoo API (zoo.pipeline.api.net).
import sys
from optparse import OptionParser

import torch
from torchvision import datasets, models, transforms
from zoo.common.nncontext import init_nncontext
from zoo.pipeline.api.net import TorchNet

def predict(img_path):
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(img_path, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])),
        batch_size=8, shuffle=False, num_workers=1, pin_memory=True)
    model = models.resnet18(pretrained=True).eval()
    net = TorchNet.from_pytorch(model, [1, 3, 224, 224])
    for inputs, labels in val_loader:
        output = net.predict(inputs.numpy(), distributed=True).collect()
        index = [o.argmax() for o in output]
        print(index)

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--image", type=str, dest="img_path",
                      help="The path where the images are stored, "
                           "can be either a folder or an image path")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Torch ResNet Prediction Example")
    predict(options.img_path)
def __init__(self, data):
    sc = init_nncontext()
    # Broadcast the data once so every executor can read it without reshipping.
    self.broadcast_data = sc.broadcast(data)
    self._value = None
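# A minimal, self-contained sketch of the broadcast pattern used above,
# outside any class (values and variable names are made up for illustration):
import numpy as np
from zoo.common.nncontext import init_nncontext

sc = init_nncontext()
lookup = sc.broadcast(np.arange(10))
doubled = sc.parallelize(range(3)).map(lambda i: lookup.value[i] * 2).collect()
print(doubled)  # [0, 2, 4]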
parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.01") parser.add_option("--log_dir", dest="log_dir", default="/tmp/.bigdl") parser.add_option("--model", dest="model") (options, args) = parser.parse_args(sys.argv) data_path = options.data_path token_length = int(options.token_length) sequence_len = int(options.sequence_length) max_words_num = int(options.max_words_num) training_split = float(options.training_split) batch_size = int(options.batch_size) sc = init_nncontext( create_spark_conf().setAppName("Text Classification Example")) print('Processing text dataset...') texts = get_news20(base_dir=data_path) text_data_rdd = sc.parallelize(texts, options.partition_num) word_meta = analyze_texts(text_data_rdd) # Remove the top 10 words roughly. You might want to fine tune this. word_meta = dict(word_meta[10:max_words_num]) word_mata_broadcast = sc.broadcast(word_meta) word2vec = get_glove(base_dir=data_path, dim=token_length) # Ignore those unknown words. filtered_word2vec = dict( (w, v) for w, v in word2vec.items() if w in word_meta) filtered_word2vec_broadcast = sc.broadcast(filtered_word2vec)
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse

import cv2

from zoo.common.nncontext import init_nncontext
from zoo.models.image.objectdetection import *

sc = init_nncontext("Object Detection Example")

parser = argparse.ArgumentParser()
parser.add_argument('model_path', help="Path where the model is stored")
parser.add_argument('img_path', help="Path where the images are stored")
parser.add_argument('output_path', help="Path to store the detection results")
parser.add_argument("--partition_num", type=int, default=1,
                    help="The number of partitions")

def predict(model_path, img_path, output_path, partition_num):
    model = ObjectDetector.load_model(model_path)
    image_set = ImageSet.read(img_path, sc,
if __name__ == "__main__": parser = OptionParser() parser.add_option("--data_path", dest="data_path") parser.add_option("--embedding_file", dest="embedding_file") parser.add_option("--question_length", dest="question_length", default="10") parser.add_option("--answer_length", dest="answer_length", default="40") parser.add_option("--partition_num", dest="partition_num", default="4") parser.add_option("-b", "--batch_size", dest="batch_size", default="200") parser.add_option("-e", "--nb_epoch", dest="nb_epoch", default="30") parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.001") parser.add_option("-m", "--model", dest="model") parser.add_option("--output_path", dest="output_path") (options, args) = parser.parse_args(sys.argv) sc = init_nncontext("QARanker Example") q_set = TextSet.read_csv(options.data_path + "/question_corpus.csv", sc, int(options.partition_num)).tokenize().normalize()\ .word2idx(min_freq=2).shape_sequence(int(options.question_length)) a_set = TextSet.read_csv(options.data_path+"/answer_corpus.csv", sc, int(options.partition_num)).tokenize().normalize()\ .word2idx(min_freq=2, existing_map=q_set.get_word_index())\ .shape_sequence(int(options.answer_length)) train_relations = Relations.read(options.data_path + "/relation_train.csv", sc, int(options.partition_num)) train_set = TextSet.from_relation_pairs(train_relations, q_set, a_set) validate_relations = Relations.read(options.data_path + "/relation_valid.csv", sc, int(options.partition_num)) validate_set = TextSet.from_relation_lists(validate_relations, q_set, a_set)
def predict(self, data, feature_cols=None, batch_size=4):
    """
    Predict input data.

    :param batch_size: Int. Set the batch size; default is 4.
    :param data: data to be predicted. XShards, Spark DataFrame, numpy array
           and list of numpy arrays are supported. If data is XShards, each
           partition is a dictionary of {'x': feature}, where feature is a
           numpy array or a list of numpy arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame. Default: None.
    :return: predicted result. If the input data is XShards, the predict result
             is an XShards, and each partition of the XShards is a dictionary
             of {'prediction': result}, where the result is a numpy array or a
             list of numpy arrays. If the input data is numpy arrays or a list
             of numpy arrays, the predict result is a numpy array or a list of
             numpy arrays.
    """
    sc = init_nncontext()
    model_bytes_broadcast = sc.broadcast(self.model_bytes)
    weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

    def partition_inference(partition):
        model_bytes = model_bytes_broadcast.value
        weight_bytes = weight_bytes_broadcast.value
        partition = list(partition)
        data_num = len(partition)
        ie = IECore()
        config = {'CPU_THREADS_NUM': str(self.core_num)}
        ie.set_config(config, 'CPU')
        net = ie.read_network(model=model_bytes, weights=weight_bytes,
                              init_from_buffer=True)
        net.batch_size = batch_size
        local_model = ie.load_network(network=net, device_name="CPU",
                                      num_requests=data_num)
        inputs = list(iter(local_model.requests[0].input_blobs))
        outputs = list(iter(local_model.requests[0].output_blobs))
        assert len(outputs) != 0, "The number of model outputs should not be 0."

        def add_elem(d):
            # Pad a batch shorter than batch_size by repeating its last element.
            d_len = len(d)
            if d_len < batch_size:
                rep_time = [1] * (d_len - 1)
                rep_time.append(batch_size - d_len + 1)
                return np.repeat(d, rep_time, axis=0), d_len
            else:
                return d, d_len

        results = []
        for idx, batch_data in enumerate(partition):
            infer_request = local_model.requests[idx]
            input_dict = dict()
            elem_num = 0
            if isinstance(batch_data, list):
                for i, input in enumerate(inputs):
                    input_dict[input], elem_num = add_elem(batch_data[i])
            else:
                input_dict[inputs[0]], elem_num = add_elem(batch_data)
            infer_request.infer(input_dict)
            if len(outputs) == 1:
                results.append(infer_request.output_blobs[outputs[0]].buffer[:elem_num])
            else:
                results.append(list(map(
                    lambda output: infer_request.output_blobs[output].buffer[:elem_num],
                    outputs)))
        return results

    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be a dict"
        assert "x" in dict_data, "key x should be in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of the input data (the first dim) should not be larger " \
                "than the model batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), \
                    "Each element in the x list should be a ndarray, but got " + \
                    elem.__class__.__name__
                assert elem.shape[0] <= batch_size, \
                    "The batch size of each input data (the first dim) should not be " \
                    "larger than the model batch size, otherwise some inputs will be " \
                    "ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of ndarray.")
        return feature_data

    if isinstance(data, DataFrame):
        from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))
        return convert_predict_rdd_to_dataframe(
            data, result_rdd.flatMap(lambda data: data))
    elif isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))

        def update_result_shard(data):
            shard, y = data
            shard["prediction"] = y
            return shard

        return SparkXShards(data.rdd.zip(result_rdd).map(update_result_shard))
    elif isinstance(data, (np.ndarray, list)):
        if isinstance(data, np.ndarray):
            split_num = math.ceil(len(data) / batch_size)
            arrays = np.array_split(data, split_num)
            num_slices = min(split_num, self.node_num)
            data_rdd = sc.parallelize(arrays, numSlices=num_slices)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = math.ceil(flattened[0].shape[0] / batch_size)
            num_slices = min(split_num, self.node_num)
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), \
                    "the data in the data list should be ndarrays, but got " + \
                    x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)
            data_to_be_rdd = [nest.pack_sequence_as(data, shard)
                              for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)
        print("Partition number: ", data_rdd.getNumPartitions())
        result_rdd = data_rdd.mapPartitions(
            lambda iter: partition_inference(iter))
        result_arr_list = result_rdd.collect()
        result_arr = None
        if isinstance(result_arr_list[0], list):
            result_arr = [np.concatenate([r[i] for r in result_arr_list], axis=0)
                          for i in range(len(result_arr_list[0]))]
        elif isinstance(result_arr_list[0], np.ndarray):
            result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, Spark DataFrame, a numpy array and a list of "
                         "numpy arrays are supported as input data, but got " +
                         data.__class__.__name__)
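# A minimal sketch of the input contract for the predict above: each XShards
# partition holds a dict {'x': ndarray} whose first dim must not exceed the
# model batch size. The estimator instance is not constructed in this snippet,
# so the final call is left commented; the shapes, names and the num_shards
# keyword (from the partition variant above) are assumptions.
import numpy as np
from zoo.orca.data import XShards

features = np.random.rand(16, 3, 224, 224).astype(np.float32)
shards = XShards.partition({"x": features}, num_shards=4)  # 4 samples per shard
# predictions = est.predict(shards, batch_size=4)  # 'est' assumed to exist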
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse

import cv2

from zoo.common.nncontext import init_nncontext
from zoo.models.image.objectdetection import *

sc = init_nncontext(create_spark_conf().setAppName("Object Detection Example"))

parser = argparse.ArgumentParser()
parser.add_argument('model_path', help="Path where the model is stored")
parser.add_argument('img_path', help="Path where the images are stored")
parser.add_argument('output_path', help="Path to store the detection results")

def predict(model_path, img_path, output_path):
    model = ObjectDetector.load_model(model_path)
    image_set = ImageSet.read(img_path, sc)
    output = model.predict_image_set(image_set)

    config = model.get_config()
    visualizer = Visualizer(config.label_map(), encoding="jpg")
    visualized = visualizer(output).get_image(to_chw=False).collect()
    df['hours'] = df['datetime'].dt.hour
    df['awake'] = (((df['hours'] >= awake_begin) & (df['hours'] <= awake_end))
                   | (df['hours'] == 0)).astype(int)
    return df

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-f", type=str, dest="file_path",
                      help="The file path to be read")

    (options, args) = parser.parse_args(sys.argv)
    sc = init_nncontext()

    # Read the csv data into an XShards of pandas DataFrames.
    file_path = options.file_path
    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)
    data = data_shard.collect()

    # Repartition into 2 partitions.
    data_shard = data_shard.repartition(2)

    # Apply the feature-engineering function on each shard.
    trans_data_shard = data_shard.transform_shard(process_feature)
    data2 = trans_data_shard.collect()

    sc.stop()
def load_pickle(cls, path, minPartitions=None):
    # Load pickle files from path into a SparkXShards.
    sc = init_nncontext()
    return SparkXShards(sc.pickleFile(path, minPartitions))
import numpy as np

from zoo.models.recommendation import NeuralCF
from zoo.common.nncontext import init_nncontext
import matplotlib
from sklearn import metrics
from operator import itemgetter
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.dataset import movielens
from bigdl.util.common import *

matplotlib.use('agg')
import matplotlib.pyplot as plt
%pylab inline

# Initialize the NN context; this gets a SparkContext with an optimized
# configuration for BigDL performance.
sc = init_nncontext("NCF Example")

# Data Preparation
# Download and read the MovieLens 1M data.
movielens_data = movielens.get_id_ratings("hdfs:///user/leelau/zoo/recommendation-ncf/*")

# Understand the data. Each record is in the format (userid, movieid, rating_score).
# UserIDs range between 1 and 6040; MovieIDs range between 1 and 3952.
# Ratings are made on a 5-star scale (whole-star ratings only).
# Counts of users and movies are recorded for later use.
min_user_id = np.min(movielens_data[:, 0])
max_user_id = np.max(movielens_data[:, 0])
min_movie_id = np.min(movielens_data[:, 1])
max_movie_id = np.max(movielens_data[:, 1])
rating_labels = np.unique(movielens_data[:, 2])

print(movielens_data.shape)
print(min_user_id, max_user_id, min_movie_id, max_movie_id, rating_labels)
from zoo.common.nncontext import init_nncontext
from zoo.models.anomalydetection import AnomalyDetector
import pandas as pd
from pyspark.sql import SQLContext
from pyspark import sql
from optparse import OptionParser
import sys

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--input_dir", dest="input_dir")
    parser.add_option("-b", "--batch_size", dest="batch_size", default="1024")
    parser.add_option("--nb_epoch", dest="nb_epoch", default="20")
    parser.add_option("--unroll_len", dest="unroll_len", default="24")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("Anomaly Detection Example")
    sqlContext = sql.SQLContext(sc)

    def load_and_scale(input_path):
        df = pd.read_csv(input_path)
        df['datetime'] = pd.to_datetime(df['timestamp'])
        df['hours'] = df['datetime'].dt.hour
        # Mark the hours between 6:00 and 23:00 (and midnight) as awake time.
        df['awake'] = (((df['hours'] >= 6) & (df['hours'] <= 23))
                       | (df['hours'] == 0)).astype(int)
        print(df.info())
        sqlContext = SQLContext(sc)
        dfspark = sqlContext.createDataFrame(df[["value", "hours", "awake"]])
        feature_size = len(["value", "hours", "awake"])
        return AnomalyDetector.standardScale(dfspark), feature_size
def predict(self, data, feature_cols=None):
    """
    Predict input data.

    :param data: data to be predicted. XShards, Spark DataFrame, numpy array
           and list of numpy arrays are supported. If data is XShards, each
           partition is a dictionary of {'x': feature}, where feature is a
           numpy array or a list of numpy arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame. Default: None.
    :return: predicted result. If the input data is XShards, the predict result
             is an XShards, and each partition of the XShards is a dictionary
             of {'prediction': result}, where the result is a numpy array or a
             list of numpy arrays. If the input data is numpy arrays or a list
             of numpy arrays, the predict result is a numpy array or a list of
             numpy arrays.
    """
    from pyspark.sql import DataFrame

    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be a dict"
        assert "x" in dict_data, "key x should be in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of the input data (the first dim) should not be larger " \
                "than the model batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), \
                    "Each element in the x list should be a ndarray, but got " + \
                    elem.__class__.__name__
                assert elem.shape[0] <= batch_size, \
                    "The batch size of each input data (the first dim) should not be " \
                    "larger than the model batch size, otherwise some inputs will be " \
                    "ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of ndarray.")
        return feature_data

    sc = init_nncontext()
    if isinstance(data, DataFrame):
        from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(transformed_data.rdd, sc)

        def delete_useless_result(data):
            # Drop the padded predictions beyond the real data length.
            shard, y = data
            data_length = len(shard["x"])
            return y[:data_length]

        result_rdd = xshards.rdd.zip(result_rdd).map(delete_useless_result)
        return convert_predict_rdd_to_dataframe(
            data, result_rdd.flatMap(lambda data: data))
    elif isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(transformed_data.rdd, sc)

        def update_shard(data):
            shard, y = data
            data_length = len(shard["x"])
            shard["prediction"] = y[:data_length]
            return shard

        return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))
    elif isinstance(data, (np.ndarray, list)):
        if isinstance(data, np.ndarray):
            split_num = math.ceil(len(data) / self.batch_size)
            arrays = np.array_split(data, split_num)
            data_length_list = list(map(lambda arr: len(arr), arrays))
            data_rdd = sc.parallelize(arrays, numSlices=split_num)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = math.ceil(flattened[0].shape[0] / self.batch_size)
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), \
                    "the data in the data list should be ndarrays, but got " + \
                    x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)
            # Every flattened input splits into the same per-slice lengths,
            # so compute them once from the last split.
            data_length_list = list(map(lambda arr: len(arr), x_parts))
            data_to_be_rdd = [nest.pack_sequence_as(data, shard)
                              for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)
        result_rdd = self.model.distributed_predict(data_rdd, sc)
        result_arr_list = result_rdd.collect()
        for i in range(0, len(result_arr_list)):
            result_arr_list[i] = result_arr_list[i][:data_length_list[i]]
        result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, Spark DataFrame, a numpy array and a list of "
                         "numpy arrays are supported as input data, but got " +
                         data.__class__.__name__)
dest="encoder_output_dim", default="256") parser.add_option("--training_split", dest="training_split", default="0.8") parser.add_option("-b", "--batch_size", dest="batch_size", default="128") parser.add_option("-e", "--nb_epoch", dest="nb_epoch", default="20") parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.01") parser.add_option("--log_dir", dest="log_dir", default="/tmp/.analytics-zoo") parser.add_option("-m", "--model", dest="model") (options, args) = parser.parse_args(sys.argv) sc = init_nncontext("Text Classification Example") text_set = TextSet.read(path=options.data_path).to_distributed( sc, int(options.partition_num)) print("Processing text dataset...") transformed = text_set.tokenize().normalize()\ .word2idx(remove_topN=10, max_words_num=int(options.max_words_num))\ .shape_sequence(len=int(options.sequence_length)).generate_sample() train_set, val_set = transformed.random_split( [float(options.training_split), 1 - float(options.training_split)]) if options.model: model = TextClassifier.load_model(options.model) else: token_length = int(options.token_length) if not (token_length == 50 or token_length == 100
def predict(self, data, **kwargs):
    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be a dict"
        assert "x" in dict_data, "key x should be in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[1] <= batch_size, \
                "The batch size of the input data (the second dim) should not be larger " \
                "than the model batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), \
                    "Each element in the x list should be a ndarray, but got " + \
                    elem.__class__.__name__
                assert elem.shape[1] <= batch_size, \
                    "The batch size of each input data (the second dim) should not be " \
                    "larger than the model batch size, otherwise some inputs will be " \
                    "ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of ndarray.")
        return dict_data["x"]

    sc = init_nncontext()
    if isinstance(data, SparkXShards):
        assert sc is not None, "You should pass sc (the SparkContext) if data is an XShards."
        from zoo.orca.learn.utils import convert_predict_to_xshard
        data = data.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(data.rdd, sc)
        return convert_predict_to_xshard(result_rdd)
    elif isinstance(data, (np.ndarray, list)):
        total_core_num = self.core_num * self.node_num
        if isinstance(data, np.ndarray):
            assert data.shape[1] <= self.batch_size, \
                "The batch size of the input data (the second dim) should not be larger " \
                "than the model batch size, otherwise some inputs will be ignored."
            split_num = min(total_core_num, data.shape[0])
            arrays = np.array_split(data, split_num)
            data_rdd = sc.parallelize(arrays, numSlices=split_num)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = min(total_core_num, flattened[0].shape[0])
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), \
                    "the data in the data list should be ndarrays, but got " + \
                    x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length, len(x))
                assert x.shape[1] <= self.batch_size, \
                    "The batch size of each input data (the second dim) should not be " \
                    "larger than the model batch size, otherwise some inputs will be " \
                    "ignored."
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)
            data_to_be_rdd = [nest.pack_sequence_as(data, shard)
                              for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)
        result_rdd = self.model.distributed_predict(data_rdd, sc)
        result_arr_list = result_rdd.collect()
        result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, a numpy array and a list of numpy arrays are "
                         "supported as input data, but got " + data.__class__.__name__)
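# A minimal illustration of how the ndarray branch above splits work across
# cores: with total_core_num = 4 and 10 samples, np.array_split yields four
# sub-arrays of sizes 3, 3, 2, 2, one per partition (values made up):
import numpy as np
parts = np.array_split(np.arange(10), 4)
print([len(p) for p in parts])  # [3, 3, 2, 2]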
parser.add_option("--training_split", dest="training_split", default="0.8") parser.add_option("-b", "--batch_size", dest="batch_size", default="128") parser.add_option("--nb_epoch", dest="nb_epoch", default="20") parser.add_option("-l", "--learning_rate", dest="learning_rate", default="0.01") parser.add_option("--log_dir", dest="log_dir", default="/tmp/.bigdl") parser.add_option("--model", dest="model") (options, args) = parser.parse_args(sys.argv) data_path = options.data_path token_length = int(options.token_length) sequence_len = int(options.sequence_length) max_words_num = int(options.max_words_num) training_split = float(options.training_split) batch_size = int(options.batch_size) sc = init_nncontext(create_spark_conf().setAppName("Text Classification Example")) print('Processing text dataset...') texts = get_news20(base_dir=data_path) text_data_rdd = sc.parallelize(texts, options.partition_num) word_meta = analyze_texts(text_data_rdd) # Remove the top 10 words roughly. You might want to fine tune this. word_meta = dict(word_meta[10: max_words_num]) word_mata_broadcast = sc.broadcast(word_meta) word2vec = get_glove(base_dir=data_path, dim=token_length) # Ignore those unknown words. filtered_word2vec = dict((w, v) for w, v in word2vec.items() if w in word_meta) filtered_word2vec_broadcast = sc.broadcast(filtered_word2vec)
if __name__ == "__main__": parser = OptionParser() parser.add_option("--image", type=str, dest="img_path", help="The path where the images are stored, " "can be either a folder or an image path") parser.add_option("--model", type=str, dest="model_path", help="Path to the TensorFlow model file") (options, args) = parser.parse_args(sys.argv) sc = init_nncontext("OpenVINO Object Detection Inference Example") images = ImageSet.read(options.img_path, sc, resize_height=600, resize_width=600).get_image().collect() input_data = np.concatenate( [image.reshape((1, 1) + image.shape) for image in images], axis=0) model_path = options.model_path model = InferenceModel() model.load_openvino(model_path, weight_path=model_path[:model_path.rindex(".")] + ".bin") predictions = model.predict(input_data) # Print the detection result of the first image. print(predictions[0])
    # Transpose TensorFlow NHWC format to Analytics Zoo NCHW format.
    model.add(Transpose([(2, 4), (2, 3)]))
    model.add(Contiguous())
    model.add(detector)
    # Select the detection_boxes from the output.
    model.add(SelectTable(2))
    image_set = ImageSet.read(img_path, sc, partition_num)
    transformer = ChainedPreprocessing([ImageResize(256, 256),
                                        ImageMatToTensor(),
                                        ImageSetToSample()])
    transformed_image_set = image_set.transform(transformer)
    output = model.predict_image(transformed_image_set.to_image_frame(),
                                 batch_per_partition=1)
    # Print the detection box with the highest score of the first prediction result.
    result = output.get_predict().first()
    print(result[1][0])

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--image", type=str, dest="img_path",
                      help="The path where the images are stored, "
                           "can be either a folder or an image path")
    parser.add_option("--model", type=str, dest="model_path",
                      help="The path of the TensorFlow object detection model")
    parser.add_option("--partition_num", type=int, dest="partition_num", default=4,
                      help="The number of partitions")
    (options, args) = parser.parse_args(sys.argv)

    sc = init_nncontext("TFNet Object Detection Example")
    predict(options.model_path, options.img_path, options.partition_num)
from zoo.common.nncontext import init_nncontext
from zoo.feature.image import *
import cv2
import numpy as np
from IPython.display import Image, display

sc = init_nncontext("Image Augmentation Example")

# Create a LocalImageSet from an image.
local_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/test.jpg")

# Create a LocalImageSet from an image folder.
local_image_set = ImageSet.read("/home/cdsw/image-augmentation/image")

# Create a LocalImageSet from a list of images.
image = cv2.imread("/home/cdsw/image-augmentation/image/test.jpg")
local_image_set = LocalImageSet([image])
print(local_image_set.get_image())
print('isDistributed: ', local_image_set.is_distributed(),
      ', isLocal: ', local_image_set.is_local())

# Create a DistributedImageSet from an image.
distributed_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/test.jpg", sc, 2)

# Create a DistributedImageSet from an image folder.
distributed_image_set = ImageSet.read("/home/cdsw/image-augmentation/image/", sc, 2)

# Create a DistributedImageSet from an image RDD and a label RDD
# (the original comment said LocalImageSet, but this builds a distributed set).
image = cv2.imread("/home/cdsw/image-augmentation/image/test.jpg")
image_rdd = sc.parallelize([image], 2)
label_rdd = sc.parallelize([np.array([1.0])], 2)
distributed_image_set = DistributedImageSet(image_rdd, label_rdd)