def __init__(self, root='./data', train=True, seq_len=100, shuffle=False,
             fragment=1, features=['all'], overlapping=True):
    self.root = root
    self.train = train
    self.seq_len = seq_len
    self.shuffle = shuffle
    self.features = features
    self.overlapping = overlapping

    all_X, all_Y = load_data()

    # Normalize data
    mean, std = load_mean_std()
    all_X = [np.divide(np.subtract(x, mean), std) for x in all_X]

    start_test = 50
    if self.train:
        self.data = all_X[:start_test], all_Y[:start_test]
    else:
        self.data = all_X[start_test:62], all_Y[start_test:62]

    self._possible_starts = list(self._compute_possible_starts())
    if self.shuffle:
        np.random.shuffle(self._possible_starts)
    nb_sequences = int(len(self._possible_starts) * fragment)
    self._possible_starts = self._possible_starts[:nb_sequences]
def load_dataset(path, config):
    print('Loading data: ' + path)
    train, valid, test = read_data.load_data(path, n_words=config.vocab_size,
                                             valid_portion=0.15, maxlen=config.maxlen)
    train = read_data.prepare_data(train[0], train[1], maxlen=config.maxlen)
    valid = read_data.prepare_data(valid[0], valid[1], maxlen=config.maxlen)
    test = read_data.prepare_data(test[0], test[1], maxlen=config.maxlen)
    return (train, valid, test)
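# Hypothetical usage sketch (not part of the original file): the Config
# attributes (vocab_size, maxlen) and the './data/dataset' path are assumptions.
config = Config()
train, valid, test = load_dataset('./data/dataset', config)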
para = parameters()
samplingType = 'farthest_sampling'
pointNumber = para.pointNumber
neighborNumber = para.neighborNumber

print('Hyper-parameter:')
print('The point number and the nearest neighbor number is {} and {}'.format(para.pointNumber, para.neighborNumber))
print('The first and second layer filter number is {} and {}'.format(para.gcn_1_filter_n, para.gcn_2_filter_n))
print('The resolution for second layer is {} and the point number in cluster is {}'.format(para.clusterNumberL1, para.nearestNeighborL1))
print('The fc neuron number is {} and the output number is {}'.format(para.fc_1_n, para.outputClassN))
print('The Chebyshev polynomial order for each layer are {} and {}'.format(para.chebyshev_1_Order, para.chebyshev_2_Order))
print('The weighting scheme is {} and the weighting scaler is {}'.format(para.weighting_scheme, para.weight_scaler))

# ===============================Build model=============================
trainOperaion, sess = model_architecture(para)

# ================================Load data===============================
inputTrain, trainLabel, inputTest, testLabel = load_data(pointNumber, samplingType)
scaledLaplacianTrain, scaledLaplacianTest = prepareData(inputTrain, inputTest, neighborNumber, pointNumber)

# ===============================Train model================================
saver = tf.train.Saver()
learningRate = para.learningRate
modelDir = para.modelDir
save_model_path = modelDir + "model_" + para.fileName
weight_dict = weight_dict_fc(trainLabel, para)

# flatten the per-batch test labels into one array
testLabelWhole = []
for i in range(len(testLabel)):
    labels = testLabel[i]
    for j in labels:
        testLabelWhole.append(j)
testLabelWhole = np.asarray(testLabelWhole)
for i in X_train[0]:
    s = s + show(i) + ' '
    cnt += 1
    if cnt % 28 == 0:
        print(s)
        s = ''

X_train = X_train * 1.0 / 255
Y_train = data['dataset'][0][0][0][0][0][1]
print(Y_train[0])
n_train = 124800

X_val = data['dataset'][0][0][1][0][0][0]
X_val = X_val * 1.0 / 255
Y_val = data['dataset'][0][0][1][0][0][1]
n_val = 20800

'''
X, Y, _, _ = load_data('emnist-letters.mat')
X_train, Y_train = X
'''

s = ''
cnt = 0
for i in range(28):
    for j in range(28):
        s = s + show(int(X_train[0][i][j] * 255)) + ' '
        cnt = cnt + 1
        if cnt % 28 == 0:
            print(s)
            s = ''
print(Y_train[0])

'''
n_train = X_train.shape[0]
X_val, Y_val = Y
### Amazon Review Summarizer
### Run TF-IDF

# The purpose of this file is to visualize the calculated tf-idfs from tfidf.py
# so we can analyze them for the next steps of the project.

import os
import json
import nltk

# must change the import in tfidf to run this
from tfidf import calculate_tfidf
from read_data import load_data

if __name__ == '__main__':
    product_dict = load_data()
    # product_list = sorted(product_dict.keys(), reverse=True)  # sort descending order by num reviews

    for productID in product_dict:
        print('productID: ' + productID + '\n')
        tfidfs = calculate_tfidf(product_dict[productID])
        for reviewerID, tfidf in tfidfs.items():
            print('reviewerID: ' + reviewerID)
            # excludes a term if its tf-idf is below the threshold (the mean tf-idf)
            threshold = sum(tfidf.values()) / len(tfidf)
            sorted_tfidf = sorted(
                [(k, v) for k, v in tfidf.items() if v > threshold],
                key=lambda x: -x[1])
            for x in sorted_tfidf:
                print(x)
            print()
        break  # stops after first product
import tensorflow.compat.v1 as tf
import numpy as np

hyperparameters = Hyperparameters()

with tf.Graph().as_default():
    # =========================================================================================================
    # BUILD MODEL
    # =========================================================================================================
    train_operation = model.model_architecture(hyperparameters)

    # =========================================================================================================
    # LOAD DATA
    # =========================================================================================================
    input_train, train_label, input_test, test_label = read_data.load_data(
        hyperparameters.num_points)
    scaled_laplacian_train, scaled_laplacian_test = read_data.prepare_data(
        input_train, input_test, hyperparameters.num_neighhbors,
        hyperparameters.num_points)

    # =========================================================================================================
    # TRAIN MODEL
    # =========================================================================================================
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    saver = tf.train.Saver()
    learning_rate = hyperparameters.learning_rate
    save_model_path = '../model/'
""" print "Running cross validation..." (Xcv, ycv) = data kfold = KFold(n_splits=cv, shuffle=True, random_state=42) results = [] for train_idx, val_idx in kfold.split(Xtrain): pipeline.fit(Xcv[train_idx], ycv[train_idx]) results.append(accuracy_score( ycv[val_idx], pipeline.predict(Xcv[val_idx]) )) print "{} +/- {}".format(np.mean(results), np.std(results)) if __name__ == '__main__': print "Importing data..." Xtrain, ytrain = load_data("train",kick_eos=True,kick_bos=True) Xdev, ydev = load_data("dev",kick_bos=True,kick_eos=True) params = { "svm": { 'classifier__C': [1000, 2000], 'classifier__gamma': [0.001, 0.005] }, "nn_mlp": { "classifier__hidden_layer_sizes": [(100,), (100, 100,)], "classifier__activation": ['tanh', 'logistic'], "classifier__alpha": [0.001], "classifier__max_iter": [5000, 8000, 10000] } } classifiers = {
def train_input_fn():
    '''
    create training input tensors
    '''
    global dataframe
    train = dataframe.sample(frac=0.75)
    return input_fn(train)


def eval_input_fn():
    '''
    create testing input tensors
    '''
    global dataframe
    test = dataframe.sample(frac=0.20)
    return input_fn(test)


_, _, dataframe = load_data()
model = None


def create_model():
    '''
    create the model using the tf.Learn library to create a wide-n-deep
    neural network as per the tutorial. Later this would become a simple
    linear regression model, although this was not the case.
    '''
    global dataframe
    global model
    mdir = tempfile.mkdtemp()
    cat_col = [
        # tf.sparse_placeholder(x)
    'graph_3': tf.placeholder(tf.float32, [None, para.vertexNumG3 * para.vertexNumG3], name='graph3'),
    'poolIndex_1': tf.placeholder(tf.int32, [None, para.vertexNumG2 * para.poolNumG1], name='poolIndex1'),
    'poolIndex_2': tf.placeholder(tf.int32, [None, para.vertexNumG3 * para.poolNumG2], name='poolIndex2'),
    'poolIndex_3': tf.placeholder(tf.int32, [None, para.vertexNumG4 * para.poolNumG3], name='poolIndex3')
    # 'lr': tf.placeholder(tf.float32, name='lr'),
}

# ================================Load data===============================
inputTrain, trainLabel, inputTest, testLabel = read_data.load_data(
    para.vertexNumG1, para.samplingType, para.dataDir)
# layer_1: (1) graph generate
scaledLaplacianTrain, scaledLaplacianTest = read_data.prepareData(
    inputTrain, inputTest, para.edgeNumG1, para.vertexNumG1, para.dataDir)
train_weight_dict = utils.train_weight_dict(trainLabel, para)
eval_weight_dict = utils.eval_weight_dict(testLabel)

# ================================Create model===============================
model = RPGCN(para, placeholders, logging=True)

# =============================Initialize session=============================
sess = tf.Session()

# ==============================Init variables===============================
if para.restoreModel:
    model.load(para.ckptDir, sess)
else:
    sess.run(tf.global_variables_initializer())

# =============================Graph Visualizing=============================
# ===============================Hyper parameters========================
para = Parameters()
para.info()
para.log()

# ============================Define placeholders==========================
placeholders = {
    'isTraining': tf.placeholder(tf.bool, name='is_training'),
    'coordinate': tf.placeholder(tf.float32, [None, para.pointNumber, para.input_data_dim], name='coordinate'),
    'label': tf.placeholder(tf.float32, [None, para.outputClassN], name='label'),
}

# ================================Load data===============================
inputTrain, trainLabel, inputTest, testLabel = read_data.load_data(
    para.pointNumber, para.samplingType, para.dataDir)
scaledLaplacianTrain, scaledLaplacianTest = read_data.prepareData(
    inputTrain, inputTest, para.neighborNumber, para.pointNumber, para.dataDir)

# ================================Create model===============================
model = models.PointNet(para, placeholders, logging=True)

# =============================Initialize session============================
sess = tf.Session()

# ==============================Init variables===============================
if para.restoreModel:
    model.load(para.ckptDir, sess)
else:
    sess.run(tf.global_variables_initializer())

# =============================Graph Visualizing=============================
TIMESTAMP = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
merged_summary = tf.summary.merge_all()
# train log
if __name__ == "__main__": #configs config = Config() config.layer = int(sys.argv[1]) config.step = int(sys.argv[2]) print("dataset: " + sys.argv[3]) print("iteration: " + str(config.layer)) print("step: " + str(config.step)) print("model: " + str(sys.argv[4])) #word2vec f = open(vector_path, 'rb') matrix = np.array(pickle.load(f)) config.vocab_size = matrix.shape[0] #load datasets train_dataset, valid_dataset, test_dataset = read_data.load_data(\ path=path,n_words=config.vocab_size) config.num_label = len(set(train_dataset[1])) print("number label: " + str(config.num_label)) train_dataset = read_data.prepare_data(train_dataset[0], train_dataset[1]) valid_dataset = read_data.prepare_data(valid_dataset[0], valid_dataset[1]) test_dataset = read_data.prepare_data(test_dataset[0], test_dataset[1]) with tf.Graph().as_default(), tf.Session() as session: initializer = tf.random_normal_initializer(0, 0.05) classifier = Classifer(config=config, session=session) total = 0 #print trainable variables for v in tf.trainable_variables(): print(v.name)
def main():
    '''
    method to be the main method so I can use it from an interpreter session.
    Trains and evaluates a (linear) model.
    '''
    DEBUG = True
    training_sessions = 200  # 10k now b/c i want to see how not terrible it /can/ get
    display_step = 10
    save_step = 5

    x_i, model, optimizer, y_act, cost, summary = create_model()
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
    print(model)
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

    _, _, data = load_data()
    if DEBUG:
        print(data[0:4])

    saver = tf.train.Saver()
    with tf.Session() as sess:
        init(sess, saver)
        for i in range(training_sessions + 1):
            train_x, train_y = get_train_data(data)
            test_x, test_y = get_test_data(data)

            ######
            for j in range(len(train_x)):
                x_in = force_feed(train_x[j])
                y_out = train_y[j]
                sess.run(optimizer, feed_dict={x_i: x_in, y_act: y_out})
                # print(optimizer.eval())

            if i % save_step == 0:
                saved_path = save(sess, saver)
                # saver.save(sess, CHECKPOINT_FILE)
                print('saved step %d at %s' % (i, saved_path))

            if i % display_step == 0:
                r = 0  # later randomize
                # loss = sess.run(cost, feed_dict=feed(test_x, test_y))
                x_in = force_feed(test_x[r])
                y_o = test_y[r]
                y_f = sess.run(model, feed_dict={x_i: x_in})  # , y_act: y_o})
                c = sess.run(cost, feed_dict={x_i: x_in, y_act: y_o})
                print('c')
                print(str(c))
                # print(c.eval())
                print('input: %s, output: %f, expected: %f, cost %f' %
                      (test_x[r], y_f, y_o, c))
                print('%d step; cost: %s' % (i, c))
                # LaTeX table of "results" (easy enough to grep/sed out later)
                # Headers: Iteration number, expected, predicted, cost
                print('%% %d & %.5f & %.5f & %.2f \\\\' % (i, y_o, y_f, c))
                # print(c.eval())

        # Summarize stuff
        summary = tf.summary.merge_all()
        fw = tf.summary.FileWriter('./logs', sess.graph)

        ######
        '''
""" print "Running cross validation..." (Xcv, ycv) = data kfold = KFold(n_splits=cv, shuffle=True, random_state=42) results = [] for train_idx, val_idx in kfold.split(Xtrain): pipeline.fit(Xcv[train_idx], ycv[train_idx]) results.append( accuracy_score(ycv[val_idx], pipeline.predict(Xcv[val_idx]))) print "{} +/- {}".format(np.mean(results), np.std(results)) if __name__ == '__main__': kick_ebos = True Xtrain, ytrain = load_data("train", kick_eos=kick_ebos, kick_bos=kick_ebos) Xdev, _ = load_data("test", kick_bos=kick_ebos, kick_eos=kick_ebos) nac_pipeline = Pipeline([ ('features', FeatureUnion([ ('5gram_perplexity', KenLMPerplexity(ngram=5)), ('3gram_perplexity', KenLMPerplexity(ngram=3)), ('4gram_perplexity', KenLMPerplexity(ngram=4)), ('6gram_perplexity', KenLMPerplexity(ngram=6)), ('2gram_perplexity', KenLMPerplexity(ngram=2)), ('type_tokens_ratio', TypeTokenRatiosFeature()), ])), ('nn_mlp', MLPClassifier(hidden_layer_sizes=(
# [email protected] 2020/03/06
from model import MLP
from read_data import load_data
import random
import matplotlib.pyplot as plt

test_data, test_label, train_data, train_label = load_data()

test_set = []
for m in zip(test_data, test_label):
    test_set.append([m[0], m[1]])

nn = MLP(4, 2, 3,
         h_w=[random.random() for _ in range(8)], h_b=0.3,
         o_w=[random.random() for _ in range(6)], o_b=0.6,
         learn_rate=0.2)

record = []
print("Training... please wait")
for epoch in range(1000):
    for i in zip(train_data, train_label):
        nn.train(i[0], i[1])
    if epoch % 10 == 0:
        error = round(nn.total_error(test_set), 9)
        # print("Epochs trained: ", epoch, "\nTest-set error: ", error)
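# Hypothetical continuation (not in the original file): if the
# `if epoch % 10 == 0:` block above also did `record.append((epoch, error))`,
# the matplotlib import at the top could plot the learning curve. The
# (epoch, error) layout of `record` is an assumption.
if record:
    epochs_seen = [e for e, _ in record]
    errors = [err for _, err in record]
    plt.plot(epochs_seen, errors)
    plt.xlabel("epoch")
    plt.ylabel("test-set error")
    plt.show()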
"""import modules""" import read_data as rd FILE_NAME = 'items.txt' process_data = rd.load_data(FILE_NAME) def sort_data(choice, order): """Function to sort data""" if order == 'asc': sorted_items = sorted(process_data, key=lambda el: el[choice]) else: sorted_items = sorted(process_data, key=lambda el: el[choice], reverse=True) return sorted_items
    start_time = time.time()
    test_acc, _ = run_epoch(session, config, model, test_dataset,
                            tf.no_op(), 1, False)
    print("Eval Accuracy = %.2f time: %.3f\n" %
          (100 * test_acc, time.time() - start_time))


def word_to_vec(matrix, session, config, *args):
    for model in args:
        session.run(tf.assign(model.embedding, matrix))


if __name__ == "__main__":
    config = Config()
    train_dataset, test_dataset = read_data.load_data(path=config.data_path + "parsed_data/")

    # convert the datasets into matrices
    train_dataset = read_data.prepare_data(train_dataset[0], train_dataset[1], train_dataset[2])
    test_dataset = read_data.prepare_data(test_dataset[0], test_dataset[1], test_dataset[2])

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto()) as session:
        classifier = Classifer(config=config, session=session)
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=config.max_to_keep)
        for epoch_id in range(config.max_max_epoch):
            train_test_model(config, epoch_id, session, classifier,
                             train_dataset, test_dataset, saver)
    print(domain_size)
    if domain_size <= 0:
        print("No dataset")
        exit(1)
    return domain_size, domain_list


if __name__ == "__main__":
    # configs
    config = Config()
    domain_size, domain_list = get_domains()

    # load dataset
    train_datasets, valid_datasets, test_datasets = [], [], []
    for domain in domain_list:
        train, valid, test = read_data.load_data(
            path='dataset' + config.dataset + '/' + domain + '/dataset',
            n_words=config.vocab_size,
            valid_portion=config.valid_portion, maxlen=config.maxlen)
        train_datasets.append(train)
        valid_datasets.append(valid)
        test_datasets.append(test)

    # transform dataset to matrix
    for index in range(domain_size):
        train = read_data.prepare_data(train_datasets[index][0],
                                       train_datasets[index][1],
                                       maxlen=config.maxlen,
                                       traindata=True, index=index)
        valid = read_data.prepare_data(valid_datasets[index][0],
                                       valid_datasets[index][1],
                                       maxlen=config.maxlen,
                                       traindata=False,
def main(unused_args):
    # configs
    config = Config()

    # domains to be processed
    domain_list = sys.argv[1:]
    domain_size = len(domain_list)
    if domain_size <= 0:
        print("No dataset")
        exit(1)

    # load dataset
    train_datasets, valid_datasets, test_datasets = [], [], []
    for domain in domain_list:
        train, valid, test = read_data.load_data(
            path='dataset' + config.dataset + '/' + domain + '/dataset',
            n_words=config.vocab_size,
            valid_portion=config.valid_portion, maxlen=config.maxlen)
        train_datasets.append(train)
        valid_datasets.append(valid)
        test_datasets.append(test)

    # transform dataset to matrix
    for index in range(domain_size):
        train = read_data.prepare_data(train_datasets[index][0],
                                       train_datasets[index][1],
                                       maxlen=config.maxlen, traindata=True)
        valid = read_data.prepare_data(valid_datasets[index][0],
                                       valid_datasets[index][1],
                                       maxlen=config.maxlen, traindata=False)
        test = read_data.prepare_data(test_datasets[index][0],
                                      test_datasets[index][1],
                                      maxlen=config.maxlen, traindata=False)
        train_datasets[index] = train
        valid_datasets[index] = valid
        test_datasets[index] = test

    config.num_classes = count_labels(train_datasets[0][2])

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as session:
        initializer = tf.random_normal_initializer(0, 0.05)

        # training model for shared weights
        with tf.variable_scope("shared_model", reuse=None, initializer=initializer):
            share_model_train = EmbeddingModel(is_training=True, config=config,
                                               session=session, trainable=True)
        # testing model for shared weights
        with tf.variable_scope("shared_model", reuse=True, initializer=initializer):
            share_model_test = EmbeddingModel(is_training=False, config=config,
                                              session=session, trainable=True)

        # build models
        train_models = []
        test_models = []
        for index in range(domain_size):
            with tf.variable_scope("m" + str(index), reuse=None, initializer=initializer):
                train_model = Combine_two_model(share_model_train, config)
            with tf.variable_scope("m" + str(index), reuse=True, initializer=initializer):
                test_model = Combine_two_model(share_model_test, config)
            train_models.append(train_model)
            test_models.append(test_model)

        init = tf.global_variables_initializer()
        session.run(init)

        # initialize share model's embedding with word2vec
        word_to_vec(session, config, share_model_train)

        # train test model
        train_test_model(config, session,
                         train_models, test_models, test_models,
                         train_datasets, valid_datasets, test_datasets)
import sklearn as sk
import read_data
import numpy as np
import pandas
# import dttry
import decisiontree
import svm
import sklearn.tree as tree
import pydotplus
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn import tree
from decisiontree import MyDecisionTree
from sklearn.metrics import accuracy_score

X, y = read_data.load_data("spambase.data")
X_train, X_test, y_train, y_test = read_data.split_data(X, y)
# print("X_train shape", X_train.shape)
# print("y_train shape", y_train.shape)
# print("X_test shape", X_test.shape)
# print("y_test shape", y_test.shape)
# print(y_test)

my_dt = MyDecisionTree(X_train, y_train)
dt = my_dt.fit()
my_dt_predict = my_dt.predict(X_test)
print("MyDecisionTree accuracy: ", accuracy_score(y_test, my_dt_predict) * 100)

# clf = tree.DecisionTreeClassifier()
# clf.fit(X_train, y_train)
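# Hypothetical continuation (not in the original file): the commented-out lines
# above hint at a scikit-learn baseline; a minimal sketch of that comparison,
# reusing the same train/test split:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
sk_predict = clf.predict(X_test)
print("sklearn DecisionTreeClassifier accuracy: ",
      accuracy_score(y_test, sk_predict) * 100)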
# Pre-training commands (needs to be run only once)
import read_data
from read_data import load_data, create_match_file, generate_test_data


def create_label_file(filename, matches):
    # the with-block closes the file, so no explicit f.close() is needed
    with open(filename, 'w') as f:
        for match in matches:
            if match._label:
                label = '1'
            else:
                label = '0'
            f.write(label + "\n")


# Create training data
matches = load_data(10000)
create_match_file('dota.25.train', matches, seconds=1500, clustering=0)

# Create dev/test data
matches = generate_test_data(start=10001, number_of_points=100)
create_match_file('dota.25.dev', matches, seconds=1500, clustering=0)
create_label_file('dota.25.label', matches)
# =======================================================================
pointNumber = para.pointNumber
neighborNumber = para.neighborNumber
os.environ['CUDA_VISIBLE_DEVICES'] = str(para.gpu)

with tf.Graph().as_default():
    # ===============================Build model=============================
    trainOperaion = model_architecture(para)
    # init = tf.global_variables_initializer()
    # sess = tf.Session()
    # sess.run(init)

    # ================================Load data===============================
    if para.dataset == 'ModelNet40':
        inputTrain, trainLabel, inputTest, testLabel = load_data(
            pointNumber, samplingType)
    elif para.dataset == 'ModelNet10':
        ModelNet10_dir = '/raid60/yingxue.zhang2/ICASSP_code/data/'
        with open(ModelNet10_dir + 'input_data', 'rb') as handle:
            a = pickle.load(handle)
        inputTrain, trainLabel, inputTest, testLabel = a
    else:
        print("Please enter a valid dataset")

    scaledLaplacianTrain, scaledLaplacianTest = prepareData(
        inputTrain, inputTest, neighborNumber, pointNumber)

    # ===============================Train model================================
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    saver = tf.train.Saver()
from HDF5_loader import *
from parameters import *
import numpy as np
import read_data
import utils

para = Parameters()

inputTrain, trainLabel, inputTest, testLabel = read_data.load_data(
    2048, 'farthest_sampling',
    '/home/ym/PycharmProjects/Fundamentals_of_Python/PointClassfication/data/')

trainLabel = np.concatenate([value for value in trainLabel.values()]).tolist()
testLabel = np.concatenate([value for value in testLabel.values()]).tolist()
trainCoor = np.concatenate([value for value in inputTrain.values()])
testCoor = np.concatenate([value for value in inputTest.values()])

# collect the indices of the training samples belonging to each of the 40 classes
class_index = [[] for i in range(40)]
for i in range(40):
    for n, label in enumerate(trainLabel):
        if label == i:
            class_index[i].append(n)

# take the first point cloud of class 0 and add a batch dimension
coor = np.array(trainCoor[class_index[0][0]])
coor = np.expand_dims(coor, axis=0)
print(coor.shape)

IndexL1, centroid_coordinates_1 = utils.farthest_sampling_new(
    coor, M=512, k=4, batch_size=1, nodes_n=2048)