def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    parser.add_argument('-o', '--output', dest="output", required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    # Fill in cpu, memory and computing-device information for both masters and workers.
    # We assume the layout file the user gives is correct.
    all_machines = masters + workers
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        # Use math.ceil to guarantee the memory volume.
        # E.g. if the user sets 999.1MB, we ensure there is 1000MB to avoid scheduling issues.
        machine['memory_mb'] = math.ceil(
            parse_quantity(sku_info['mem']) / 1024 / 1024)
        machine['cpu_vcores'] = sku_info['cpu']['vcore']
        if 'computing-device' in sku_info:
            machine['computing_device'] = sku_info['computing-device']

    # Add each machine to its computing-device group.
    computing_device_groups = defaultdict(list)
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        if 'computing-device' in sku_info:
            computing_device_groups[sku_info['computing-device']['type']].append(
                machine['hostname'])

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': config,
        'head_node': head_node,
        'computing_device_groups': computing_device_groups,
    }
    map_table = {"env": environment}

    generate_template_file("quick-start/pre-check.yml.template",
                           "{0}/pre-check.yml".format(output_path), map_table)
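# Hedged sketch: `load_yaml_config` is not defined in these snippets. Assuming it simply
# reads a YAML file into a Python dict with PyYAML, a minimal version could look like this:
import yaml


def load_yaml_config(path):
    # Parse the YAML file at `path` and return its contents (usually a dict).
    with open(path, "r") as f:
        return yaml.safe_load(f)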
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    args = parser.parse_args()

    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    try:
        validate_layout_schema(layout)
    except Exception as exp:
        logger.error("layout.yaml schema validation failed: \n %s", exp)
        sys.exit(1)

    if not check_layout(layout, cluster_config):
        logger.error("layout.yaml schema validation failed")
        sys.exit(1)

    logger.info("layout.yaml schema validation succeeded.")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    parser.add_argument('-o', '--output', dest="output", required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    # Hivedscheduler is enabled by default,
    # but if the user sets enable_hived_scheduler to false manually,
    # we should disable it.
    if 'enable_hived_scheduler' in cluster_config and cluster_config[
            'enable_hived_scheduler'] is False:
        hived_config = {}
    else:
        hived_config = get_hived_config(layout, cluster_config)

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node,
        'hived': hived_config
    }
    map_table = {"env": environment}

    generate_template_file(
        os.path.abspath(
            os.path.join(
                os.path.abspath(__file__),
                '../../quick-start/services-configuration.yaml.template')),
        "{0}/services-configuration.yaml".format(output_path), map_table)
def main():
    config = load_yaml_config("config.yml")

    tokenizer_path = config["data"]["tokenizer_path"]
    seq_length = config["model"]["seq_length"]
    pb_path = config["model"]["pb_path"]
    label_size = config["model"]["label_size"]
    kl_threshold = config["model"]["kl_threshold"]

    tokenizer = pickle.load(open(tokenizer_path, "rb"))

    with tf.Session() as sess:
        tf.saved_model.loader.load(sess, [tag_constants.SERVING], pb_path)
        graph = tf.get_default_graph()
        input_x = graph.get_tensor_by_name("input_x:0")
        keep_prob = graph.get_tensor_by_name("dropout_keep_prob:0")
        pred = graph.get_tensor_by_name("softmaxLayer/probs:0")

        while True:
            trackString = input("Please enter a trajectory sequence: ")
            if trackString == "exit":
                print("Exiting detection")
                exit(0)
            track = trackString.strip().split(" ")
            if len(track) < 2:
                print("A trajectory must contain at least 2 points")
                continue
            unknow = [x for x in track if x not in tokenizer.word_index]
            if unknow:
                print("Trajectory points {} do not exist".format(unknow))
                continue
            tokenizerSeq = tokenizer.texts_to_sequences([trackString])[0]
            for i in range(1, len(tokenizerSeq)):
                label = tokenizerSeq[i]
                feature = tokenizerSeq[(i - 4) if (i - 4) >= 0 else 0:i]
                if len(feature) < seq_length:
                    feature = [1] * (seq_length - len(feature)) + feature
                feature_reshape = np.array(feature).reshape(-1, 4)
                logits = sess.run(pred, feed_dict={
                    input_x: feature_reshape, keep_prob: 1})
                # multi-class cross entropy
                loss = sess.run(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=[label]))[0]
                # KL divergence
                onehot = [1e-6] * label_size
                onehot[label] = 1
                kl = stats.entropy(logits[0], onehot)
                # use the KL divergence to flag abnormal moves
                abnormal = "[{}]".format(" " if kl <= kl_threshold else "×")
                print("{} cross entropy: {:.4f}, KL divergence: {:.4f}, trajectory: {} => {}".format(
                    abnormal, loss, kl,
                    track[(i - 4) if (i - 4) >= 0 else 0:i], track[i]))
            print("")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    parser.add_argument('-o', '--output', dest="output", required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node
    }
    map_table = {
        "env": environment
    }

    generate_template_file(
        "quick-start/hosts.yml.template",
        "{0}/hosts.yml".format(output_path),
        map_table
    )
    generate_template_file(
        "quick-start/prophet.yml.template",
        "{0}/prophet.yml".format(output_path),
        map_table
    )
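# Hedged sketch: `generate_template_file` is also not shown in these snippets. It is
# assumed to render a Jinja2 template with the given mapping and write the result to
# the output path; a minimal version could look like this:
from jinja2 import Template


def generate_template_file(template_path, output_path, map_table):
    # Render the template with the mapping (e.g. {"env": {...}}) and write it out.
    with open(template_path, "r") as f:
        template = Template(f.read())
    with open(output_path, "w") as f:
        f.write(template.render(**map_table))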
def download_file(local, remote):
    config = load_yaml_config('certificate.yml')
    session = boto3.session.Session(
        aws_access_key_id=config.AWS_ACCESS_ID,
        aws_secret_access_key=config.AWS_SECRET_KEY)
    s3 = session.resource('s3')
    try:
        s3.Bucket(config.AWS_BUCKET).download_file(remote, local)
        print("Download " + remote + " as " + local)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print("The object does not exist")
        print("Download failed.")
        return False
def train():
    config = load_yaml_config("config.yml")

    display_step = config["model"]["display_step"]
    evaluate_step = config["model"]["evaluate_step"]
    save_step = config["model"]["save_step"]
    checkpoint_path = config["model"]["checkpoint_path"]
    pickle_path = config["data"]["pickle_path"]
    pb_path = config["model"]["pb_path"]

    model = TodAutoEncoder(config)
    print(model.input_x)
    print(model.loss)

    with open(pickle_path, "rb") as f:
        _ = pickle.load(f)
        _, sparse_test = pickle.load(f)
    card, sparse = zip(*sparse_test)
    test = dense_transform(list(sparse))

    sess = get_session()
    sess.run(tf.global_variables_initializer())

    batch_data = get_batch()
    for batch in batch_data:
        _, loss_train, step = model.step(sess, batch)
        if step % display_step == 0:
            print("step: %d => loss: %.4f" % (step, loss_train))
        if step % evaluate_step == 0:
            _, loss_test, _ = model.step(sess, test)
            print("{0:-^30}".format("evaluation loss: %.4f" % loss_test))
            print("")
        if step % save_step == 0:
            model.save(sess, checkpoint_path)
    model.save(sess, checkpoint_path)

    shutil.rmtree(pb_path, ignore_errors=True)
    builder = tf.saved_model.builder.SavedModelBuilder(pb_path)
    inputs = {'input_x': tf.saved_model.utils.build_tensor_info(model.input_x)}
    outputs = {'output': tf.saved_model.utils.build_tensor_info(model.loss)}
    signature = tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs, outputs=outputs,
        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    builder.add_meta_graph_and_variables(sess, [tag_constants.SERVING],
                                         {'my_signature': signature})
    builder.save()
def upload_file(local, remote):
    config = load_yaml_config('certificate.yml')
    session = boto3.session.Session(
        aws_access_key_id=config.AWS_ACCESS_ID,
        aws_secret_access_key=config.AWS_SECRET_KEY)
    s3 = session.client('s3')
    try:
        s3.upload_file(local, config.AWS_BUCKET, remote)
        '''
        s3.put_object(Bucket=config.AWS_BUCKET, Key=remote, Body=local)
        '''
        print("Upload " + local + " as " + remote)
        return True
    except Exception:
        print("Upload failed.")
        return False
def train():
    config = load_yaml_config("config.yml")

    embedding_size = config["model"]["embedding_size"]
    min_count = config["model"]["min_count"]
    sg = config["model"]["sg"]
    output_path = config["data"]["output_path"]
    window = config["model"]["window"]
    embedding_model = config["model"]["embedding_model"]
    embedding_path = config["model"]["embedding_path"]

    sentences = word2vec.LineSentence(output_path)
    model = word2vec.Word2Vec(sentences,
                              size=embedding_size,
                              window=window,
                              min_count=min_count,
                              sg=sg,
                              workers=multiprocessing.cpu_count())

    print("word counts: %d" % len(model.wv.vocab.keys()))
    model.save(embedding_model)
    model.wv.save_word2vec_format(embedding_path, binary=False)
def get_batch():
    config = load_yaml_config("config.yml")
    pickle_path = config["data"]["pickle_path"]
    epochs = config["model"]["epochs"]
    batch_size = config["model"]["batch_size"]

    sparse_list = load_train_pkl(pickle_path)
    card, sparse = zip(*sparse_list)
    sparse = list(sparse)
    data_length = len(sparse)

    # repeat
    for epoch in range(epochs):
        # shuffle
        random.shuffle(sparse)
        # batch_size
        for batch in range(0, data_length, batch_size):
            if batch + batch_size <= data_length:
                sparse_batch = sparse[batch:(batch + batch_size)]
                # convert the sparse matrices to dense arrays
                dense_list = dense_transform(sparse_batch)
                yield dense_list
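# Hedged sketch: `dense_transform` is not shown either. Based on its usage above it is
# assumed to turn a list of scipy sparse matrices into a dense 2-D numpy array:
import numpy as np


def dense_transform(sparse_batch):
    # Flatten each sparse matrix and stack them into a (batch, features) array.
    return np.array([m.toarray().reshape(-1) for m in sparse_batch])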
def main():
    config = load_yaml_config("config.yml")
    pickle_path = config["data"]["pickle_path"]
    pb_path = config["model"]["pb_path"]

    with open(pickle_path, "rb") as f:
        _ = pickle.load(f)
        sparse_train, sparse_test = pickle.load(f)
    card_st = sparse_train + sparse_test

    with tf.Session() as sess:
        tf.saved_model.loader.load(sess, [tag_constants.SERVING], pb_path)
        graph = tf.get_default_graph()
        input_x = graph.get_tensor_by_name("input_x:0")
        pred = graph.get_tensor_by_name("cost/absolute_difference/value:0")

        # prediction
        with open("tod_result.csv", "w", encoding="utf8") as f:
            f.write("{},{}\n".format("cardno", "loss"))
            for card, st_sparse in card_st:
                st_dense = st_sparse.toarray().reshape(1, -1)
                res = sess.run(pred, feed_dict={input_x: st_dense})
                f.write("{},{}\n".format(card, res))
    sparse_list = []
    for card, st_dict in tfidfCstDict.items():
        # create a sparse matrix
        sparse_mat = dok_matrix((loc_length, hour_length), dtype=np.float32)
        for space, hour_dict in st_dict.items():
            for hour, tfidf in hour_dict.items():
                space_index = locToIndex[space]
                sparse_mat[space_index, hour] = tfidf
        sparse_list.append((card, sparse_mat))
    return sparse_list, locToIndex


def dump_pickel():
    tfidfCstDict, locCountDict, hourCountDict = get_space_time_dict()
    sparse_list, locToIndex = get_space_time_sparse(tfidfCstDict,
                                                    locCountDict,
                                                    hourCountDict)
    # train_test_split
    sparse_train, sparse_test = train_test_split(sparse_list, test_size=0.2)

    with open(pickle_path, "wb") as f:
        pickle.dump(locToIndex, f)
        pickle.dump((sparse_train, sparse_test), f, protocol=0)


if __name__ == "__main__":
    config = load_yaml_config("config.yml")
    file_path = config["data"]["file_path"]
    pickle_path = config["data"]["pickle_path"]
    dump_pickel()
# Use None or 0 if you want to return all possible neighbors within the selected distance.
config.batch_size = 32  # Training batch size of the FNN models.
config.epochs = [50, 300]
config.epochs_train2 = 300
config.epochs_interval = 50
# `epochs` is a list of length 2 containing the range of epochs after which training of
# the M1 models stops and a new model M2 is trained.
# M1's training stops after epochs[0] + n * interval such that n > 0 and
# epochs[0] + n * interval <= epochs[1].
# M2's training lasts epochs_train2 epochs.
config.epochs_interval_evaluation = 1
# M2's training stops every epochs_interval_evaluation epochs to evaluate performance.
# M1's training stops to evaluate performance only if test1=True.
config.folds_number = 10  # Number of K-fold CV folds.
config.embedding_name = "tuned_embedding"
# The embedding to be used. There must be a directory containing the embedding in the data folder.
config.test1 = False
# True if you want to evaluate M1's performance on the test set during training. Use False to skip the evaluation.
config.OUTPUTS_DIR = None
# The base path in which the tests' outputs will be saved. Set to None if you want to store them in the project's dir.
config.embedding_dict_to_use = None
# If you want to use the dictionary of another embedding, set this parameter to that embedding's name. Use None otherwise.
# There must be a directory containing the embedding in the data folder.

config = load_yaml_config(
    config,
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "coherence_test_config.yaml"
    ),
)
from lettuce import step, world
from nose.tools import assert_equals, assert_true, assert_false
import utils
import os
import bunch.special

path = os.path.abspath(__file__)
dir_path = os.path.dirname(path)
utils.init(dir_path)
config_file = os.path.join(dir_path, "config.yaml")
config = utils.load_yaml_config(config_file)
bunch_working_dir = dir_path


def dump(obj):
    for attr in dir(obj):
        print("obj.%s = %s" % (attr, getattr(obj, attr)))


mysql_admin = config['db']['admin']
mysql_admin_pwd = config['db']['admin_pwd']


class step_assert(object):
    def __init__(self, step):
        self.step = step

    def assert_true(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_true(expr, msg)
config = Config()
config.ocean_traits = [0, 1, 2, 3, 4]
# OCEAN personality traits to which the embedding is tuned: O:0, C:1, E:2, A:3, N:4.
config.epochs_number = 10  # NLP model's training epochs.
config.num_reviews = 1500000  # Number of reviews to use for training (training set + test set).
config.voc_dim = 6 * 10**4  # Number of terms in the tuned embedding.
config.train_zeros = False
# Use True if you want to train the weights representing padding tokens, False otherwise.
config.output_type = "mean"  # Target of the model: 'mean' or 'sum' of the known terms' scores in the review.
config.shuffle = True
# If True, reviews from the Yelp dataset will be shuffled before extracting num_reviews reviews.
# If False, the first num_reviews of the Yelp dataset will be extracted.
config.features_config = [100, int(100 / 2), int(100 / 4)]
# Configuration of the NLP model's architecture: features, filters and hidden units.
config.embedding_name = "new_tuned_embedding"  # Name of the dir to be created that stores the tuned embedding.
config.load_reviews_from_scratch = False
# Use False if you have already loaded and stored the reviews; use True if you want to reload and re-store them.
config.tune_embedding = True
# Use True to train the model, False otherwise (e.g. if you just want to load reviews).

config = load_yaml_config(
    config,
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "tune_embedding_config.yaml"),
)
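# Hedged sketch: in the two Config-based snippets above, `load_yaml_config` takes an
# existing Config object plus a YAML path and is assumed to override matching attributes
# with any values found in the file. A minimal version under that assumption:
import os
import yaml


def load_yaml_config(config, yaml_path):
    # Override the attributes of `config` with the values found in `yaml_path`, if it exists.
    if os.path.exists(yaml_path):
        with open(yaml_path, "r") as f:
            overrides = yaml.safe_load(f) or {}
        for key, value in overrides.items():
            setattr(config, key, value)
    return config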
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    parser.add_argument('-o', '--output', dest="output", required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    if 'openpai_kube_network_plugin' not in cluster_config or cluster_config[
            'openpai_kube_network_plugin'] != 'weave':
        count_input = 0
        while True:
            user_input = input(
                "Is your cluster in the Azure cloud? (Y/N) (case sensitive)")
            if user_input == "N":
                break
            if user_input == "Y":
                break
            print(" Please type Y or N. It's case sensitive.")
            count_input = count_input + 1
            if count_input == 3:
                logger.error(
                    "3 invalid inputs. Sorry, we have to stop your operation.")
                sys.exit(1)
        if user_input == "Y" \
                and ('openpai_kube_network_plugin' not in cluster_config
                     or cluster_config['openpai_kube_network_plugin'] == 'calico'):
            logger.error(
                "Azure does not support calico, please change the openpai_kube_network_plugin to weave")
            logger.error(
                "https://docs.projectcalico.org/reference/public-cloud/azure#why-doesnt-azure-support-calico-networking")
            sys.exit(1)

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node
    }
    map_table = {"env": environment}

    generate_template_file("quick-start/hosts.yml.template",
                           "{0}/hosts.yml".format(output_path), map_table)
    generate_template_file("quick-start/openpai.yml.template",
                           "{0}/openpai.yml".format(output_path), map_table)
from lettuce import step, world
from nose.tools import assert_equals, assert_true, assert_false
import utils
import os
import bunch.special

path = os.path.abspath(__file__)
dir_path = os.path.dirname(path)
utils.init(dir_path)
config_file = os.path.join(dir_path, "config.yaml")
config = utils.load_yaml_config(config_file)
bunch_working_dir = dir_path


def dump(obj):
    for attr in dir(obj):
        print("obj.%s = %s" % (attr, getattr(obj, attr)))


mysql_admin = config['db']['admin']
mysql_admin_pwd = config['db']['admin_pwd']


class step_assert(object):
    def __init__(self, step):
        self.step = step

    def assert_true(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_true(expr, msg)

    def assert_false(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_false(expr, msg)