def test(model: keras.Model):
    """Compile *model* for evaluation and score it on the CIFAR-10 test split.

    Prints loss/accuracy via ``model.evaluate``; returns nothing.
    """
    model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    x_test, y_test = data.load_test('cifar10', channel_first=False)
    # One pass over the test set in whole batches (remainder dropped).
    n_steps = len(x_test) // common.batch_size
    model.evaluate(data.get_test_iterator(x_test, y_test),
                   batch_size=common.batch_size, steps=n_steps)
def __init__(self, vocab_size):
    """Collect hyper-parameters from FLAGS and build the TF session config."""
    # input sequence (sentence) length
    self.sequence_length = FLAGS.sequence_length
    # number of training epochs
    self.num_epochs = FLAGS.num_epochs
    # mini-batch size
    self.batch_size = FLAGS.batch_size
    # vocabulary size
    self.vocab_size = vocab_size
    # word-embedding dimensionality
    self.embedding_size = FLAGS.embedding_size
    # convolution filter widths, i.e. 1-gram, 2-gram, 3-gram and 5-gram
    self.filter_sizes = [1, 2, 3, 5]
    # hidden-layer size
    self.hidden_size = FLAGS.hidden_size
    # number of filters per width
    self.num_filters = FLAGS.num_filters
    # L2 regularisation weight (the paper uses 0.0001)
    self.l2_reg_lambda = FLAGS.l2_reg_lambda
    # dropout keep probability
    self.keep_prob = FLAGS.keep_prob
    # learning rate (the paper uses 0.01)
    self.lr = FLAGS.lr
    # margin (the paper uses 0.009)
    self.m = FLAGS.margin
    # Let ops without a GPU kernel fall back to the CPU; don't log placement.
    self.cf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    '''
    GPU内存使用策略
    '''
    # (GPU memory usage strategy) grow the GPU memory allocation on demand
    self.cf.gpu_options.allow_growth = True
    # only claim 20% of GPU memory:
    # self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2
    # NOTE(review): both arguments are sequence_length — confirm the second
    # argument isn't meant to be something else (e.g. a batch size).
    self.test_data = data.load_test(self.sequence_length, self.sequence_length)
# Fix: this chunk mixed Python-2 print statements with Python-3 print()
# calls, which is a SyntaxError under Python 3. All prints normalized to
# the function form (output is unchanged for these arguments).
print("m_paths")
print(m_paths)
metadata_path_all = glob.glob(m_paths)
print("length of metadata_path_all")
print(len(metadata_path_all))

# Optional third CLI argument selects the data subset (default: "test").
if len(sys.argv) >= 4:
    subset = sys.argv[3]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = "test"

num_seq = 64

# Only the test subset is implemented; everything else exits with a message.
if subset == "test":
    xb_test, _, xs_test, _ = data.load_test(CVsplit)
    dat = utils.add_dims_seq([xb_test, xs_test])
    xb_test = dat[0]
    xs_test = dat[1]
elif subset == "train":
    sys.exit(subset + ": not implemented yet")
elif subset == "train_valid":
    sys.exit(subset + ": not implemented yet")
else:
    sys.exit(subset + ": not implemented yet")

# Stack the two input blocks into one matrix of examples.
X = np.vstack((xb_test, xs_test))
n = np.size(X, axis=0)
print("X shape:")
print(X.shape)
# NOTE(review): the bindings below are the tail of a pattern-builder function
# whose `def` line lies outside this chunk; the `return` belongs to it.
# `norm` matches a concatenate over a 6-tuple and `red` over a 4-tuple —
# presumably NASNet normal vs. reduction cells; confirm against the builder.
norm = dfp.is_op('concatenate')(dfp.is_tuple(
    (dfp.wildcard(), dfp.wildcard(), dfp.wildcard(),
     dfp.wildcard(), dfp.wildcard(), dfp.wildcard())))
red = dfp.is_op('concatenate')(dfp.is_tuple(
    (dfp.wildcard(), dfp.wildcard(), dfp.wildcard(), dfp.wildcard())))
return [norm, red]


if __name__ == '__main__':
    from nasnet import get_model
    from resnet import ConvBnSubst
    import graph
    import work
    import data

    # Build the NASNet workload in fp16 and apply the substitution passes
    # in sequence: Conv+BN, Conv+Add, then Avg+Add.
    x_test, y_test = data.load_test('cifar10', channel_first=True)
    test_gen = data.TvmDataGen(x_test, y_test)
    nasnet = get_model(6, load_weights=True)
    wl_1 = work.Workload.from_keras(nasnet, dtype='float16')
    wl_1.mod = avg_include_pool(wl_1.mod)
    wl_2 = graph.SubstPass(ConvBnSubst)(wl_1)
    wl_3 = graph.SubstPass(ConvAddSubst)(wl_2)
    wl_4 = graph.SubstPass(AvgAddSubst)(wl_3)
    # Dump a visualization of the Conv+BN-substituted graph.
    graph.visualize(wl_2, name='nasnet_cb', path='logs')
    # wl_1.evaluate(test_gen)
    # wl_2.evaluate(test_gen)
    # wl_3.evaluate(test_gen)
    # wl_4.evaluate(test_gen)
    # pat_list = _get_breakpoint_patterns()
    # rcd_1 = work.BreakpointRecord(wl_1, pat_list)
    # rcd_2 = work.BreakpointRecord(wl_2, pat_list)
def trans_ans(y):
    """Map each two-score row to a hard label: 1 if row[1] >= row[0], else 0."""
    return [int(row[1] >= row[0]) for row in y]


# Build the ResNet-50 classifier, restore the trained weights, and write a
# submission CSV of hard predictions for the test set.
model = resnet.ResnetBuilder.build_resnet_50(
    (config.img_channels, config.img_rows, config.img_cols), config.nb_classes)
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.load_weights(config.model_path)

x_test = data.load_test(config.test_path)
predicted = trans_ans(model.predict(x_test))

# Submission ids cover the test range 4000..7549.
id = list(range(4000, 7550))
submit = pd.DataFrame({'id': id, 'y': predicted})
submit.to_csv(config.submitfile_path, index=False)
# Fix: the original only did `import theano.tensor as T`, which binds the
# name `T` but NOT `theano`, so the later `theano.function(...)` call raised
# NameError. `import theano` added. Python-2 print statements converted to
# print() calls, and the duplicate `predict(data_X)` evaluation removed
# (the first result is reused for df['probs']).
import theano
import theano.tensor as T
import numpy as np
import sys

import data
import model
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates


if __name__ == '__main__':
    model_filename = sys.argv[1]
    test_filename = sys.argv[2]
    train_filename = sys.argv[3]

    P = Parameters()
    data_X, df = data.load_test(test_filename, train_filename)
    # Build the network with layer sizes 256-128-64-32 on the test features.
    f = model.build(P,
                    input_size=data_X.shape[1],
                    hidden_sizes=[256, 128, 64, 32])

    X = T.matrix('X')
    # Predict hard labels: probability > 0.5.
    predict = theano.function(
        inputs=[X],
        outputs=f(X, test=True) > 0.5,
    )
    P.load(model_filename)

    output = predict(data_X)
    print(data_X.shape)
    print(output.shape)
    print(df.values.shape)
    df['probs'] = output
# NOTE(review): this writerow call is the tail of a CSV-writing function whose
# `def` line lies outside this chunk.
writer.writerow(review.__dict__)


def remove_diacritic(input):
    """
    Accept a unicode string, and return a normal string without any
    diacritical marks.

    input arguments:
    input: the string to strip accents from
    output arguments:
    the stripped input
    """
    # NFKD decomposition separates base characters from combining accents;
    # encoding to ASCII with 'ignore' then drops the accent codepoints.
    return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')


if __name__ == "__main__":
    # Python 2 script: pick the train or test review set from argv, then
    # spell-correct, stem, and dump each language's reviews to CSV.
    dataset = sys.argv[1]
    if dataset == 'train':
        reviews = data.load_train()
    elif dataset == 'test':
        reviews = data.load_test()
    else:
        raise ValueError('No dataset ' + dataset + ' found!')
    print "reviews loaded"
    # Group reviews by detected language and process each group separately.
    reviews_dict_languages = split_by_language(reviews)
    for k, v in reviews_dict_languages.iteritems():
        print k
        review_list = correct_spelling_and_stem(k, v)
        print "corrected and stemmed"
        save_reviews_to_csv(k, review_list, dataset)
        print "saved to csv"
def main():
    """Spark mini-batch SGD driver.

    Trains with one mapPartitions worker per partition, early-stops on
    validation loss, then logs train/test accuracy and dumps the run
    statistics to a JSON file.
    """
    logs = {
        'start-time': now(),
        'num_workers': PARTITIONS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'batch': BATCH,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='/data/logs/tmp_logs.txt',
                        level=logging.WARNING)
    logging.warning("{}:Loading Training Data...".format(now()))
    # Load data
    val_df, train_df = data.load_train(spark)
    # Collect validation for loss computation
    val_collected = val_df.collect()
    # Create initial weight vector; dimensionality = max feature index + 1
    dimensions = train_df.rdd \
        .map(lambda row: max(row.features.keys())).max() + 1
    w = [0.0] * dimensions
    # Create the partitions of the train dataset (one per worker)
    partitions = train_df.rdd.zipWithIndex() \
        .map(lambda x: (x[1], x[0])) \
        .partitionBy(PARTITIONS)
    # Ring buffer of the last PERSISTENCE validation losses (early stopping)
    persistence = [0.0] * PERSISTENCE
    smallest_val_loss = float('inf')
    logs['start-compute-time'] = now()
    logging.warning("{}:Starting SGD...".format(logs['start-compute-time']))
    logs['epochs-stats'] = []
    for epoch in range(EPOCHS):
        epoch_stat = {'epoch_number': epoch, 'epoch_start': now()}
        logging.warning("{}:EPOCH:{}".format(now(), epoch))
        # Broadcast w to make it available for each worker
        w_b = sc.broadcast(w)
        # Calculate Mini Batch Gradient Descent for each partition
        partition_deltas_w = \
            partitions.mapPartitions(lambda x: sgd(x, w_b)).collect()
        # Collect total update weights for all workers in one epoch
        total_delta_w = {}
        for delta_w in partition_deltas_w:
            for k, v in delta_w.items():
                if k in total_delta_w:
                    total_delta_w[k] += v
                else:
                    total_delta_w[k] = v
        # Update weights
        for k, v in total_delta_w.items():
            w[k] += LEARNING_RATE * v
        val_loss = loss(val_collected, w)
        epoch_stat['val_loss'] = val_loss
        epoch_stat['epoch_end'] = now()
        logs['epochs-stats'].append(epoch_stat)
        logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))
        # Early stopping criteria: stop once the best loss seen is smaller
        # than every loss in the persistence window.
        persistence[epoch % PERSISTENCE] = val_loss
        if smallest_val_loss < min(persistence):
            # Early stop
            logging.warning("{}:EARLY STOP!".format(now()))
            break
        else:
            smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss
    logs['end-compute-time'] = now()
    logging.warning("{}:Calculating Train Accuracy".format(now()))
    train_accuracy = accuracy(train_df, w)
    logs['train_accuracy'] = train_accuracy
    logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))
    logging.warning("{}:Calculating Test Accuracy".format(now()))
    test_df = data.load_test(spark)
    test_accuracy = accuracy(test_df, w)
    logs['test_accuracy'] = test_accuracy
    logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))
    spark.stop()
    logs['end_time'] = now()
    # Persist the whole run's statistics as a one-element JSON list.
    with open(
            '/data/logs/logs.workers_{}.batch_{}.epochs_{}.time_{}.json'.
            format(PARTITIONS, BATCH, EPOCHS, logs['start-time']), 'w') as f:
        json.dump([logs], f)
# Build the MoCo model on the GPU; `ver` selects the MoCo variant and
# `v3_encoder` supplies a ViT backbone for v3.
model = MoCo(dim=args.moco_dim, K=args.moco_k, m=args.moco_m, T=args.moco_t,
             ver=args.version, arch=args.arch, bn_splits=args.bn_splits,
             symmetric=args.symmetric, v3_encoder=vit).cuda()
print(model)
# exit(0)

# Data: training loader, memory-bank loader, and test loader.
train_data, train_loader = load_train(args)
memory_data, memory_loader = load_memory(args)
test_data, test_loader = load_test(args)

# define optimizer: version 3 uses AdamW, earlier versions use SGD+momentum.
if args.version == 3:
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr,
                                  weight_decay=args.wd)
else:
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                weight_decay=args.wd, momentum=0.9)

# load model if resume
epoch_start = 1
# NOTE(review): the body of this resume branch continues outside this chunk.
if args.resume != '':
print('Loading word2vec...')
embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC, binary=True)

print('Loading training data...')
train = data.load_train()  # [:LIMIT]
dic = data.load_dic()

# Represent each tweet as the average of its word embeddings (300-d),
# then fit a random forest on the training labels.
print('Computing tweet averages...')
X = np.zeros(shape=(len(train), 300))
y = np.zeros(shape=(len(train), ), dtype=int)
for idx, row in enumerate(train):
    # row[0]: tweet text, row[1]: label (per the y assignment below)
    X[idx] = tweet_embedding_by_average(row[0], dic, embeddings)
    y[idx] = row[1]

print('Training the model...')
clf = RandomForestClassifier(n_estimators=200, max_depth=10)
clf.fit(X, y)

print('Loading test data...')
test = data.load_test()  # [:LIMIT]
T = np.zeros(shape=(len(test), 300))
ids = np.zeros(shape=(len(test), ))
for idx, row in enumerate(test):
    # row[0]: tweet text, row[1]: tweet id (kept for the submission file)
    T[idx] = tweet_embedding_by_average(row[0], dic, embeddings)
    ids[idx] = row[1]

print('Predicting...')
Yhat = clf.predict(T)
data.generate_submission(Yhat, ids)
import pickle
from collections import defaultdict

import data

# Module-level side effect: populate data.ALL_TRAIN and data.TEST.
data.load_all_train()
data.load_test()

# A word counts as "high frequency" when its count exceeds this fraction
# of the total token count.
FLEXIBLE_PATTERN_RATIO = 1e-4


def main():
    """Count word frequencies over all train+test tweets and collect the
    high-frequency word set (destined for ../data/hfws.pickle)."""
    print("counting all words from the whole dataset...")
    word_freq = defaultdict(int)
    total_num_words = 0
    # Tokenize by lowercased whitespace split of the normalised text.
    for tweet in data.ALL_TRAIN + data.TEST:
        for word in tweet.normalised_text.lower().split():
            word_freq[word] += 1
            total_num_words += 1
    print("counted!")
    print("defining high frequency words...")
    hfw_threshold = total_num_words * FLEXIBLE_PATTERN_RATIO
    hfw = set()
    for word in word_freq:
        if word_freq[word] > hfw_threshold:
            hfw.add(word)
    # save to file
    print("saving to ../data/hfws.pickle...")
predictions = predictions + np.load(predictions_path) # print "shape of predictions and max" # print(predictions.shape) # print(predictions.max()) predictions = predictions / len(predictions_path_all) # evening it out # print(predictions.max()) import data if len(sys.argv) == 4: subset = sys.argv[3] assert subset in ['train', 'valid', 'test', 'train_valid'] else: subset = "test" if subset == "test": xb_test, tb_test, _, ts_test = data.load_test(CVsplit) elif subset == "train": sys.exit(subset + ": not implemented yet") elif subset == "train_valid": sys.exit(subset + ": not implemented yet") else: sys.exit(subset + ": not implemented yet") t = np.vstack((tb_test, ts_test)) n = np.size(t, axis=0) import utils AUC = utils.auc(predictions, t) total_AUC += AUC predictions = predictions > prob
predictions = predictions + np.load(predictions_path) # print "shape of predictions and max" # print(predictions.shape) # print(predictions.max()) predictions = predictions / len(predictions_path_all) # evening it out # print(predictions.max()) import data if len(sys.argv) == 4: subset = sys.argv[3] assert subset in ["train", "valid", "test", "train_valid"] else: subset = "test" if subset == "test": xb_test, tb_test, _, ts_test = data.load_test(CVsplit) elif subset == "train": sys.exit(subset + ": not implemented yet") elif subset == "train_valid": sys.exit(subset + ": not implemented yet") else: sys.exit(subset + ": not implemented yet") t = np.vstack((tb_test, ts_test)) n = np.size(t, axis=0) import utils AUC = utils.auc(predictions, t) total_AUC += AUC
def main():
    """Hogwild-style parallel SGD with multiprocessing.

    Worker processes update a shared weight array (optionally guarded by a
    single lock); the parent monitors per-epoch validation losses from a
    queue for early stopping, then computes train/test accuracy and dumps
    run statistics to JSON.
    """
    logs = {
        'start-time': now(),
        'lock': LOCK,
        'num_workers': WORKERS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='logs/tmp_logs.txt', level=logging.WARNING)
    with Manager() as manager:
        logging.warning("{}:Loading Training Data...".format(now()))
        logging.warning("{}:FULL TEST {}".format(now(), FULL_TEST))
        logging.warning("{}:WORKERS {}".format(now(), WORKERS))
        logging.warning("{}:LOCK {}".format(now(), LOCK))
        val, train = data.load_train()
        # Share the training data across worker processes.
        train = manager.dict(train)
        # Weight dimensionality = max feature index + 1.
        dim = max([max(k) for k in train['features']]) + 1
        init_w = [0.0] * dim
        if LOCK:
            # Synchronised shared weights: one lock for the whole array.
            lock = Lock()
            w = Array(c_double, init_w, lock=lock)
        else:
            # Lock-free shared weights (Hogwild-style racy updates).
            w = RawArray(c_double, init_w)
        logs['start-compute-time'] = now()
        start_time = time()
        logging.warning("{}:Starting SGD...".format(
            logs['start-compute-time']))
        val_queue = Queue()
        workers = []
        for worker in range(WORKERS):
            p = Process(target=sgd, args=(worker, train, w, val_queue))
            p.start()
            workers.append(p)
        logs['epochs-stats'] = []
        # Initial early stopping variables
        persistence = [0.0] * PERSISTENCE
        smallest_val_loss = float('inf')
        workers_done = [False] * WORKERS
        while True:
            workers_alive = any([p.is_alive() for p in workers])
            if not workers_alive:
                logging.warning("{}:WORKERS DONE!".format(now()))
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(), time() - start_time))
            # Exit only once the queue has been fully drained too.
            if not workers_alive and val_queue.empty():
                logging.warning("{}:WORKERS DONE AND QUEUE EMPTY!".format(
                    now()))
                final_weights = w[:]
                break
            # Block until getting a message
            val_queue_item = val_queue.get()
            worker = val_queue_item['worker']
            epoch = val_queue_item['epoch']
            weights = val_queue_item['weights']
            val_loss = loss(val, weights)
            logging.warning("{}:EPOCH:{}".format(now(), epoch))
            logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))
            logs['epochs-stats'].append({
                'epoch_number': epoch,
                'val_loss': val_loss
            })
            # Early stopping criteria: stop once the best loss seen is
            # smaller than every loss in the persistence window.
            persistence[epoch % PERSISTENCE] = val_loss
            if smallest_val_loss < min(persistence):
                # Early stop
                logging.warning("{}:EARLY STOP!".format(now()))
                # Terminate all workers, but save the weights before
                # because a worker could have a lock on them. Terminating
                # a worker doesn't release its lock.
                final_weights = w[:]
                for p in workers:
                    p.terminate()
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(), time() - start_time))
                break
            else:
                smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss
        # Close queue
        val_queue.close()
        val_queue.join_thread()
        logging.warning("{}:Calculating Train Accuracy".format(now()))
        train_accuracy = accuracy(train, final_weights)
        logs['train_accuracy'] = train_accuracy
        logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))
        # Calculate test accuracy
        logging.warning("{}:Calculating Test Accuracy".format(now()))
        test = data.load_test(FULL_TEST)
        test_accuracy = accuracy(test, final_weights)
        logs['test_accuracy'] = test_accuracy
        logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))
        logs['end_time'] = now()
        # Persist the whole run's statistics as a one-element JSON list.
        with open(
                'logs/logs.w_{}.l_{}.e_{}.time_{}.json'.format(
                    WORKERS, LOCK, EPOCHS, logs['start-time']), 'w') as f:
            json.dump([logs], f)
def load_data(options):
    """Load the test set, optionally keep only Normal plus one attack
    family (options.attack), and return the preprocessed result."""
    frame = load_test()
    if options.attack is not None:
        attack_classes = get_attack_classes(options.attack)
        keep = frame.attack_class.isin(['Normal', *attack_classes])
        frame = frame[keep]
    return preprocess(frame, normalize=options.normalize)
from data import load_train, load_test, predictions_to_csv

# Restore the trained VGG9 model and render its architecture diagram.
model_name = 'VGG9'
model = load_model(f'./project3/trained_models/{model_name}.h5')
print(model.summary())
plot_model(model, to_file=f'./project3/figures/{model_name}_arch.png',
           show_shapes=True)

num_classes = 10
# 64x64 single-channel images
img_x, img_y = 64, 64

# Load data
train_images, train_labels = load_train()
x_test = load_test()
# Hold out 20% as a validation split, stratified by label.
# NOTE(review): random_state=42 presumably reproduces the training-time
# split — confirm against the training script.
_, x_valid, _, y_valid = train_test_split(train_images, train_labels,
                                          test_size=0.2, random_state=42,
                                          stratify=train_labels)

# Reshape to (N, 64, 64, 1) and normalize pixel values to [0, 1]
x_valid = x_valid.reshape(x_valid.shape[0], img_x, img_y, 1)
x_valid = x_valid.astype('float32')
x_valid /= 255.
print('Validation dim: ', x_valid.shape)
x_test = x_test.reshape(x_test.shape[0], img_x, img_y, 1)
x_test = x_test.astype('float32')
import time

from data import load_test, predictions_to_csv, load_model

if __name__ == '__main__':
    # Load the held-out test documents, timing the load.
    print('Loading data...')
    start = time.time()
    test_docs = load_test()
    print(f'Time to load data: {time.time()-start}')

    # Replace with name of model you want load
    pipeline = load_model('sgd_bigram_tfidf.joblib')

    # Predict and write the submission CSV in one step.
    predictions_to_csv(pipeline.predict(test_docs), 'sgd_bigram_tfidf.csv')
def main( data_path, train_data_path, val_data_path, test_data_path, output_path, prediction_name='suggestion.json', cache_dir=None, model_type='lda', ): ''' train a model and make a prediction Args: data_path: path to the data json file train_data_path: path to the train data val_data_path: path to the val data test_data_path: path to the test data output_path: path to the output dir prediction_name: the name of prediction output file cache_dir: where to save cache model: which model to use Returns: None ''' # load data print('Loading data') documents, titles = data.load_doc_title( data_path, cache_path=os.path.join(cache_dir, 'preproccessed') if cache_dir is not None else None, ) train_data = data.load_train(train_data_path) val_data = data.load_val(val_data_path) test_data = data.load_test(test_data_path) # convert to corpus if needed if model_type in ('lda', ): print('Preparing corpus') dictionary = utils.make_dictionary( documents.content, cache_path=os.path.join(cache_dir, 'dictionary') if cache_dir is not None else None, filter_=False, ) documents['bow'] = utils.make_corpus(documents.content, dictionary) titles['bow'] = utils.make_corpus(titles.content, dictionary) # train print('Training model') if model_type == 'lda': model = engine.CustomLDA(documents, titles, dictionary) model = model.train(train_data, val_data, output_path) elif model_type == 'doc2vec': model = engine.CustomDoc2vec(documents, titles) model = model.train(train_data, val_data, output_path) else: raise ValueError(model_type) # inference prediction = model.predict(test_data) prediction_output = os.path.join(output_path, prediction_name) data.dump_prediction(prediction, prediction_output) return