def train():
    uid = request.get_json()['user']
    trainsongs = request.get_json()['songs']
    trainsongs.sort()
    modelfile = krotos.modelfname(trainsongs, uid)
    conn = engine.connect()
    df = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == False), conn)
    testdf = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == True), conn)
    df = df.loc[df['songid'].isin(trainsongs)]
    testdf = testdf.loc[testdf['songid'].isin(trainsongs)]
    X, Y = df.values[:, 1:], df.values[:, 0]
    x, y = testdf.values[:, 1:], testdf.values[:, 0]
    # Fit the scaler and label encoder on the training split only and reuse
    # them for the test split, so test statistics don't leak into scaling and
    # the label mapping stays consistent across splits.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X).astype('float32')
    x = scaler.transform(x).astype('float32')
    encoder = LabelEncoder().fit(Y)
    Y = encoder.transform(Y)
    y = encoder.transform(y)
    Y = krotos.padlabels(Y)
    y = krotos.padlabels(y)
    acc = krotos.train(X, Y, x, y, modelfile)
    if acc:
        return {"status": "ok", "modelfile": modelfile, "accuracy": acc}
    else:
        return {"status": "not ok"}, 500
def comparealgos():
    from sklearn import metrics
    trainsongs = [1, 2]
    uid = 1
    modelfile = krotos.modelfname(trainsongs, uid)
    conn = engine.connect()
    df = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == False), conn)
    testdf = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == True), conn)
    df = df.loc[df['songid'].isin(trainsongs)]
    testdf = testdf.loc[testdf['songid'].isin(trainsongs)]
    X, Y = df.values[:, 1:], df.values[:, 0]
    x, y = testdf.values[:, 1:], testdf.values[:, 0]
    # Reuse the scaler and encoder fitted on the training split for the test split.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X).astype('float32')
    x = scaler.transform(x).astype('float32')
    encoder = LabelEncoder().fit(Y)
    Y = encoder.transform(Y)
    y = encoder.transform(y)
    Y = krotos.padlabels(Y)
    y = krotos.padlabels(y)
    krotos.train(X, Y, x, y, modelfile)
    # Baseline: a random forest on the same features, for comparison.
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, Y)
    pred = np.array(clf.predict(x))
    acc = metrics.accuracy_score(y, pred) * 100
    return {"status": "ok", "rf_accuracy": acc}
def load_data(PARAMS, folder, file_list):
    n_fft = PARAMS['n_fft'][PARAMS['Model']]
    n_mels = PARAMS['n_mels'][PARAMS['Model']]
    featName = PARAMS['featName'][PARAMS['Model']]
    FV = np.empty([], dtype=np.float32)
    labels_mu = np.empty([], dtype=np.int32)
    labels_sp = np.empty([], dtype=np.int32)
    fl_count = 0
    for fName in file_list:
        fl_count += 1
        fName_path = folder + '/features/' + fName + '.npy'
        if not os.path.exists(fName_path):
            continue
        fv = np.load(fName_path, allow_pickle=True)
        fv = get_featuregram(PARAMS, PARAMS['feature_opDir'], fName, fv,
                             n_fft, n_mels, featName, save_feat=True)
        nFrames = np.shape(fv)[1]
        annotations_mu, annotations_sp, music_marker, speech_marker = get_annotations(
            PARAMS['test_path'], fName, nFrames, PARAMS['opDir'])
        if 'HarmPerc' not in featName:
            # Standardize frame-wise (features along columns after transposing).
            fv = StandardScaler(copy=False).fit_transform(fv.T).T
        else:
            # Standardize the harmonic and percussive halves separately.
            nDim = np.shape(fv)[0]
            fv_H = fv[:int(nDim / 2), :]
            fv_H = StandardScaler(copy=False).fit_transform(fv_H.T).T
            fv_P = fv[int(nDim / 2):, :]
            fv_P = StandardScaler(copy=False).fit_transform(fv_P.T).T
            fv = np.append(fv_H.astype(np.float32), fv_P.astype(np.float32),
                           axis=0)
        if np.size(FV) <= 1:
            FV = fv.astype(np.float32)
            labels_mu = music_marker.astype(np.int32)
            labels_sp = speech_marker.astype(np.int32)
        else:
            FV = np.append(FV, fv.astype(np.float32), axis=1)
            labels_mu = np.append(labels_mu, music_marker.astype(np.int32))
            labels_sp = np.append(labels_sp, speech_marker.astype(np.int32))
        print(fl_count, '/', len(file_list), fName, np.shape(FV),
              np.shape(labels_mu), np.shape(labels_sp))
    return FV, labels_mu, labels_sp
def create_model(k=29):
    # load data
    df = pd.read_csv('BikeShare.csv')
    df.index = range(1, len(df) + 1)
    # select features for model input
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values
    # select the target
    y = df['EndStationName'].values
    # standardize the features
    X = StandardScaler().fit(X).transform(X.astype(float))
    from sklearn.model_selection import train_test_split
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=4)
    # train the model and persist it (use a context manager so the file is closed)
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    with open('knnpickle_file', 'wb') as knnPickle:
        pickle.dump(knn, knnPickle)
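# A minimal companion sketch, not part of the original snippet: persist the
# fitted scaler together with the KNN model so prediction code can reuse the
# exact training-time normalization instead of re-fitting it from the raw CSV
# (as the predict method further below does). The output file name
# 'knn_and_scaler.pkl' is an illustrative assumption.
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def create_model_with_scaler(k=29):
    df = pd.read_csv('BikeShare.csv')
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values.astype(float)
    y = df['EndStationName'].values
    scaler = StandardScaler().fit(X)
    knn = KNeighborsClassifier(n_neighbors=k).fit(scaler.transform(X), y)
    with open('knn_and_scaler.pkl', 'wb') as f:
        pickle.dump({'scaler': scaler, 'knn': knn}, f)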
class WineQualityDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = pd.read_csv(path, delimiter=";")
        print(f"Rows, columns: {str(df.shape)}")
        print(df.head())
        # create a binary classification version of the target variable
        df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']]
        df = df.drop(['quality'], axis=1)
        print(df['goodquality'].value_counts())
        # store the inputs and outputs
        self.X = StandardScaler().fit_transform(df.values[:, :-1])
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
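# A minimal usage sketch, assumed rather than taken from the original source:
# wrap the splits in DataLoaders for training. 'winequality-red.csv' is an
# illustrative path for the UCI wine-quality file this class expects.
from torch.utils.data import DataLoader

dataset = WineQualityDataset('winequality-red.csv')
train_split, test_split = dataset.get_splits(n_test=0.33)
train_dl = DataLoader(train_split, batch_size=32, shuffle=True)
test_dl = DataLoader(test_split, batch_size=1024, shuffle=False)
for batch_X, batch_y in train_dl:
    pass  # feed batches to a model here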
def implement_pca_between_two_frames(image1, image2):
    # read images
    pic1 = cv2.imread(image1)
    pic2 = cv2.imread(image2)
    # convert BGR to grayscale
    prvs = cv2.cvtColor(pic1, cv2.COLOR_BGR2GRAY)
    nxt = cv2.cvtColor(pic2, cv2.COLOR_BGR2GRAY)
    # calculate dense optical flow (Farneback)
    flow = cv2.calcOpticalFlowFarneback(prvs, nxt, None, 0.5, 3, 15, 3, 5,
                                        1.2, 0)
    # obtain the angle matrix: the magnitude is discarded and the angles are
    # measured in degrees
    _, angle_matrix = cv2.cartToPolar(flow[..., 0], flow[..., 1],
                                      angleInDegrees=True)
    # run plain PCA on the standardized angle matrix (the coarse foreground)
    sklearn_pca = sklearnPCA()
    angle_std = StandardScaler().fit_transform(angle_matrix)
    sklearn_pca.fit_transform(angle_std)
    # convert to uint8 so the matrix can be written as an image
    pca_implement = angle_std.astype(np.uint8)
    # write the result
    cv2.imwrite('pca_fore_ground_matrix_' + str(image1) + '.png',
                pca_implement)
    cv2.destroyAllWindows()
def diabetes():
    df = pd.read_csv('diabetes.csv')
    cdf = df[[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
    ]]
    X = df[[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age'
    ]].values
    X = StandardScaler().fit(X).transform(X.astype(float))
    y = df['Outcome'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=4)
    '''
    k = 4
    neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
    print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
    '''
    # Sweep k from 1 to Ks-1 and record the test accuracy for each value.
    Ks = 10
    mean_acc = np.zeros((Ks - 1))
    std_acc = np.zeros((Ks - 1))
    for n in range(1, Ks):
        # train the model and predict for this k
        neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        yhat = neigh.predict(X_test)
        mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
        std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
    print("The best accuracy was", mean_acc.max(), "with k =",
          mean_acc.argmax() + 1)
def get_moons_dataset(n_samples=1000):
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    X, _ = noisy_moons
    X = StandardScaler().fit_transform(X)
    dataset = tf.data.Dataset.from_tensor_slices(X.astype(np.float32))
    return dataset
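# A minimal usage sketch (assumed): shuffle and batch the returned dataset
# before iterating, as is typical for tf.data input pipelines.
moons_ds = get_moons_dataset(n_samples=1000)
for batch in moons_ds.shuffle(1000).batch(128):
    pass  # batch is a float32 tensor of shape (<=128, 2)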
def PCA():
    dataset = readDataset()
    # Step 1: center the data around 0. When the features use different units
    # of measurement, also divide by the standard deviation.
    scaled = StandardScaler().fit_transform(dataset.X.astype(float))
    # Step 2: compute the covariance matrix of the data, where covariance
    # measures the degree of numerical interdependence between two variables.
    # Since the data is standardized, the correlation matrix computed here
    # equals the covariance matrix.
    covMatrix = np.corrcoef(scaled.astype(float).T)
    # Step 3: compute the eigenvalues and eigenvectors of the covariance matrix
    w, v = np.linalg.eig(covMatrix)
    # Check how much information can be attributed to each component
    percentage = (w / sum(w)) * 100
    print('Information attributed to each component: ', percentage)
    eig_pairs = [(np.abs(w[i]), v[:, i]) for i in range(len(w))]
    # Horizontally concatenate the eigenvectors into a projection matrix.
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1), eig_pairs[3][1].reshape(4, 1)))
    X = scaled.dot(matrix_w)
    df = pd.DataFrame(data=X,
                      columns=[
                          'Principal component 1', 'Principal component 2',
                          'Principal component 3', 'Principal component 4'
                      ])
    df['target'] = dataset.Y
    sns.pairplot(data=df, hue='target')
    plt.show()
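# A minimal cross-check sketch (assumed, with random stand-in data): on
# standardized inputs, the manual eigendecomposition above should agree with
# scikit-learn's PCA up to the sign of each component, and
# explained_variance_ratio_ corresponds to the per-component percentages.
import numpy as np
from sklearn.decomposition import PCA as SkPCA
from sklearn.preprocessing import StandardScaler

scaled_demo = StandardScaler().fit_transform(np.random.rand(100, 4))
pca = SkPCA(n_components=4)
X_sk = pca.fit_transform(scaled_demo)
print('Explained variance per component (%):',
      pca.explained_variance_ratio_ * 100)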
def compute_DBSCAN(features):
    features = StandardScaler().fit_transform(features.astype(float))
    # Compute DBSCAN
    db = DBSCAN(eps=0.25, min_samples=4).fit(features)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    return core_samples_mask, labels
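# A minimal usage sketch (assumed, with synthetic blobs as stand-in input):
# count clusters from the returned labels, treating -1 as noise, mirroring the
# standard scikit-learn DBSCAN example.
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=300, centers=3, random_state=0)
core_samples_mask, labels = compute_DBSCAN(features)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('clusters:', n_clusters, 'noise points:', n_noise)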
def apricot_select(data, k, standardize=True, chunksize=20000):
    """Does 'farthest point sampling' with apricot. For memory limitation
    reasons it is chunked with a hardcoded chunksize.
    """
    if standardize:
        print('standardizing data')
        data = StandardScaler().fit_transform(data)
    data = data.astype(np.float64)
    num_chunks = int(data.shape[0] / chunksize)
    if num_chunks > 1:
        chunksize = int(data.shape[0] / num_chunks)
    else:
        num_chunks = 1
        chunksize = len(data)
    # This assumes shuffled data and is used to make stuff a bit less
    # memory intensive
    chunklist = []
    to_select = int(k / num_chunks)
    print('Will use {} chunks of size {}'.format(num_chunks, chunksize))
    num_except = 0
    for d_ in tqdm(chunks(data, chunksize)):
        print('Current chunk has size {}'.format(len(d_)))
        if len(d_) > to_select:  # otherwise it makes no sense to select something
            try:
                X_subset = FacilityLocationSelection(to_select).fit_transform(d_)
                chunklist.append(X_subset)
            except Exception:  # pylint:disable=broad-except
                num_except += 1
                if num_except > 1:  # pylint:disable=no-else-return
                    warnings.warn(
                        'Could not perform diverse set selection for two attempts, '
                        'will perform random choice')
                    return np.random.choice(len(data), k, replace=False)
                else:
                    print('will use greedy select now')
                    X_subset = _greedy_loop(d_, to_select, 'euclidean')
                    chunklist.append(X_subset)
    # map the selected rows back to indices in the full data array
    greedy_indices = []
    subset = np.vstack(chunklist)
    print(subset.shape)
    for d in subset:
        index = np.where(np.all(data == d, axis=1))[0][0]
        greedy_indices.append(index)
    del data
    del subset
    output = list(set(greedy_indices))
    print(len(output))
    return output
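# A minimal usage sketch (assumed, with random stand-in data): select a diverse
# subset of 500 rows; the returned value is a list of row indices into the
# original array (or a random index array if selection fails twice).
import numpy as np

X = np.random.rand(50000, 16)
selected = apricot_select(X, k=500)
X_diverse = X[selected]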
def predict():
    import autokeras as ak
    iterations = 3
    uid = request.get_json()['user']
    trainsongs = request.get_json()['songs']
    trainsongs.sort()
    reclen = 15
    if 'reclen' in request.get_json():
        reclen = request.get_json()['reclen']
    if 'iterations' in request.get_json():
        iterations = request.get_json()['iterations']
    iterations = int(iterations)
    modelfile = krotos.modelfname(trainsongs, uid)
    print(modelfile)
    if os.path.isfile(modelfile):
        model = load_model(modelfile, custom_objects=ak.CUSTOM_OBJECTS)
    else:
        return {"status": "error", "reason": "train first"}, 500
    recordings = []
    for iteration in range(iterations):
        recordings.append(krotos.getFeatures(duration=reclen))
    predictions = [None] * iterations
    n_features = 12
    for i, recording in enumerate(recordings):
        # Note: this re-fits a scaler on each recording; the model is assumed
        # to have been trained on features standardized the same way.
        recording = StandardScaler().fit_transform(recording)
        recording = recording.astype('float32')
        # sum the per-frame prediction vectors for each recording
        for row in np.array(recording):
            row = np.array([row[:n_features]])
            prediction = model.predict(row)
            if predictions[i] is not None:
                predictions[i] += np.array(prediction)
            else:
                predictions[i] = np.array(prediction)
    ret = []
    preds = []
    for prediction in predictions:
        ret.append(prediction.tolist())
        preds.append(trainsongs[np.argmax(prediction)])
    return {
        "predictions": ret,
        "ids": preds,
        "overall": int(np.bincount(preds).argmax())
    }
def __init__(self, X, y):
    if not torch.is_tensor(X):
        X = StandardScaler().fit_transform(X)
        X = X.astype(np.float32)
        self.X = torch.from_numpy(X)
    else:
        raise ValueError("X should be numpy")
    if not torch.is_tensor(y):
        y = y.astype(np.float32)
        self.y = torch.from_numpy(y)
    else:
        raise ValueError("y should be numpy")
def test_32_64_decomposition_shape():
    """Test that the decomposition is similar for 32 and 64 bits data"""
    # see https://github.com/scikit-learn/scikit-learn/issues/18146
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # Compare the shapes (corresponds to the number of non-zero eigenvalues)
    kpca = KernelPCA()
    assert (kpca.fit_transform(X).shape ==
            kpca.fit_transform(X.astype(np.float32)).shape)
class Moons(Dataset):
    def __init__(self, n_samples, shuffle, noise):
        self.X, self.y = make_moons(n_samples=n_samples,
                                    shuffle=shuffle,
                                    noise=noise)
        self.X = StandardScaler().fit_transform(self.X)
        # np.int is deprecated/removed in recent numpy; use np.int64 instead
        self.X, self.y = self.X.astype(np.float32), self.y.astype(np.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), torch.from_numpy(
            np.array(self.y[idx]))
def decimate_data(datapath, doplot):
    df = pd.read_csv(datapath)
    df.columns = ["V" + str(i) for i in range(1, len(df.columns) + 1)]
    df.V1 = df.V1.astype(str)
    X = df.loc[:, "V2":]  # independent variables data
    y = df.V1  # dependent variable data
    print("Initial number of samples: " + str(len(df)))
    encoder = preprocessing.LabelEncoder()

    """ Initial random sampling: take a 20% sample of the total input sample """
    pctg = 0.20
    print("Random sampling rate: " + str(pctg * 100) + "%")
    sample_len = int(len(df) * pctg)
    random_sample = X.take(np.random.permutation(len(df))[:sample_len])
    random_sample_encoded = random_sample.apply(encoder.fit_transform)
    print("Number of random samples: " + str(len(random_sample)))

    """ K-means clustering """
    x = np.array(random_sample_encoded)
    x = x.astype(int)
    ks = range(1, 16)
    kmeans = [KMeans(n_clusters=i, random_state=0) for i in ks]
    score = [kmeans[i].fit(x).score(x) for i in range(len(kmeans))]
    score = [-score[i] for i in range(len(ks))]

    """ Plot for evaluating the elbow for k-means clustering """
    if doplot:
        # one color per plotted k value
        colors = np.random.rand(len(ks))
        plt.suptitle("Elbow Plot", fontsize=14, fontweight='bold')
        plt.scatter(ks, score, c=colors, alpha=0.5)
        plt.plot(ks, score)
        plt.ylabel('Objective Function Value')
        plt.xlabel('Number of clusters')
        plt.show()

    """ From the elbow plot, the elbow is found at k = 4;
    next, do stratified sampling on those 4 clusters. """
    k_elbow = 4
    decimated_data = stratified_sampling(kmeans[k_elbow - 1], x)
    standard_data = decimated_data[:, :-1]
    standard_data = StandardScaler().fit_transform(standard_data.astype(float))
    cluster_id_col = decimated_data[:, -1:]
    return standard_data, cluster_id_col
def predict(self, loc, time):
    # use a parameterized query instead of string concatenation to avoid
    # SQL injection
    rec = self.cur.execute(
        "SELECT StartStationID, StartStationLatitude, StartStationLongitude "
        "FROM BikeShare WHERE StartStationName LIKE ? LIMIT 1",
        (loc,)).fetchone()
    if not rec:
        return -1
    sample = [time * 60, rec[0], rec[1], rec[2], time]
    # re-fit the scaler on the training CSV so the query record is normalized
    # with the same statistics used at training time
    df = pd.read_csv('BikeShare.csv')
    df.index = range(1, len(df) + 1)
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values
    record = np.asarray(sample).reshape(1, -1)
    record = StandardScaler().fit(X).transform(record.astype(float))
    with open('knnpickle_file', 'rb') as f:
        knn = pickle.load(f)
    pred = knn.predict(record)
    return pred[0]
def Get_onewell_Data(window_size):
    '''Return one well's data for prediction'''
    filepath = "D:\投的文章\Paper_基于1DCNN的岩相分类\图鉴\井的岩相\\57-04-0-5.txt"
    data = np.loadtxt(filepath, skiprows=1, dtype=str)
    attri = data[:, 1:-1].astype(float)
    attri = StandardScaler().fit_transform(attri)
    data[:, 1:-1] = attri.astype(str)
    # collect the data for each sliding window
    data_list = []
    depth_list = []
    for y in range(window_size, len(data) - window_size):
        w_data = data[y - window_size:y + window_size + 1]
        attri = w_data[:, 1:-1].T.astype(float)
        label = w_data[window_size, -1].astype(float)  # center-sample label (unused here)
        depth = w_data[window_size, 0]
        data_list.append(attri)
        depth_list.append(depth)
    attri = np.array(data_list)
    attri = torch.tensor(attri)
    return attri, np.row_stack(depth_list)
def cluster(self):
    cluster_file = open("cluster.txt", "w")
    print(self.data.shape)
    X = self.data
    X = StandardScaler().fit_transform(X)
    db = DBSCAN(eps=10, min_samples=2).fit(X)
    labels = db.labels_
    print(labels)
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    index = []
    fitness = []
    mean = []
    print("number of estimated clusters : %d" % n_clusters_)
    cluster_file.write("number of estimated clusters : %d" % n_clusters_ + "\n")
    for k in range(n_clusters_):
        my_members = (labels == k)
        for i in range(len(X)):
            if my_members[i]:
                index += [i]
                if self.pure_data is not None:
                    # every pair of rows maps back to one individual
                    if i % 2 == 0:
                        num = int(i / 2)
                    else:
                        num = int((i - 1) / 2)
                    fitness += [self.pure_data[num].fitness.values]
        if fitness != []:
            for i in range(len(fitness[0])):
                mean += [statistics.mean([ind[i] for ind in fitness])]
        cluster_file.write("index:" + "\n")
        cluster_file.write(str(index) + "\n")
        cluster_file.write("fitness:" + "\n")
        cluster_file.write(str(fitness) + "\n")
        cluster_file.write("mean fitness:" + "\n")
        cluster_file.write(str(mean) + "\n")
        cluster_file.write("members:" + "\n")
        cluster_file.write(str(X[my_members]) + "\n")
        print(index)
        print("members:")
        print(X[my_members])
        print("fitness:" + "\n")
        print(str(fitness) + "\n")
        index = []
        fitness = []
        mean = []
    cluster_file.close()

    # 2-D embedding of the clustered points for visualization
    import matplotlib.pyplot as plt
    mds = MDS(n_components=2)
    pos = mds.fit_transform(X.astype(np.float64))
    colors = list('bgrcmykbgrcmykbgrcmykbgrcmyk')
    plt.figure(2)
    for i in range(len(pos[:, 0])):
        plt.plot(pos[i, 0], pos[i, 1], 'o',
                 markerfacecolor=colors[labels[i]],
                 markeredgecolor='k')

    plt.figure(1)
    plt.title("number of estimated clusters : %d" % n_clusters_)
    colors_cluster = [colors[labels[i]] for i in range(len(X))]
    k = [i for i in range(len(X))]
    for j in range(9):
        plt.subplot(331 + j)  # subplot indices are 1-based (331..339)
        plt.ylabel(label[j])
        #plt.ylim(limits[j][0], limits[j][1])
        plt.bar(k, X[:, j], color=colors_cluster)  # feature j across all members
    plt.show()
import numpy as np
#from sklearn.cluster import DBSCAN
#from sklearn import metrics
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

##############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
X = X.astype(np.float32)

##############################################################################
# Compute DBSCAN
import dbscan

labels = np.array(dbscan.dbscan(X, "sparse").run(0.3, 10))
core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

##############################################################################
# Plot result
class LogregClassifier:
    def __init__(self, lambd=1e-4):
        self.lambd = lambd

    def build(self, optimizer):
        x, y = self.inputs()
        pred = self.inference(x)
        loss, acc = self.loss(pred, y)
        train_op = self.train_op(loss, optimizer)
        self.ops = {
            'x': x,
            'y': y,
            'pred': pred,
            #'loss': self.ema.average(loss),
            #'acc': self.ema.average(acc),
            'loss': loss,
            'acc': acc,
            'train_op': train_op
        }
        return self.ops

    def inputs(self):
        x = tf.placeholder(tf.float32, shape=[None, self.X.shape[1]])
        y = tf.placeholder(tf.int32, shape=[None])
        return x, y

    def inference(self, x):
        with tf.variable_scope('logreg_scope') as self.scope:
            pred = tf.layers.dense(x, 10)
        return pred

    def loss(self, logits, y):
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y, logits=logits), axis=-1)
        # L2 regularization on all non-bias variables in the scope
        loss += tf.add_n([
            tf.nn.l2_loss(v)
            for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=self.scope.name)
            if 'bias' not in v.name
        ]) * self.lambd
        p = tf.cast(tf.argmax(tf.nn.softmax(logits), axis=1), tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(p, y), tf.float32))
        self.ema = tf.train.ExponentialMovingAverage(decay=0.95)
        self.average_op = self.ema.apply([loss, acc])
        return loss, acc

    def train_op(self, loss, optimizer='adam'):
        if optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(1e-3)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=self.scope.name)
            train_op = optimizer.minimize(loss, var_list=all_vars)
        with tf.control_dependencies([train_op]):
            train_op = tf.group(self.average_op)
        return train_op

    def prepare_data(self, dataset_name):
        self.dataset_name = dataset_name
        if dataset_name == 'digits':
            dataset = load_digits(n_class=10)
        elif dataset_name == 'mnist':
            dataset = fetch_mldata('MNIST original',
                                   data_home='/srv/hd1/data/vyanush/')
        self.X, self.Y = dataset.data, dataset.target
        self.X, self.Y = utils.shuffle(self.X, self.Y)
        self.X = StandardScaler().fit_transform(self.X.astype(np.float32))
        if dataset_name == 'mnist':
            self.X_train = self.X[:50000]
            self.Y_train = self.Y[:50000]
            self.X_val = self.X[50000:]
            self.Y_val = self.Y[50000:]

    def batch_iterator(self, n_epochs, batch_size):
        for epoch in range(n_epochs):
            indices = np.arange(self.X_train.shape[0])
            np.random.shuffle(indices)
            for pos in range(0, self.X_train.shape[0] - batch_size + 1,
                             batch_size):
                ind = indices[pos:pos + batch_size]
                yield self.X_train[ind], self.Y_train[ind]
path = ''
for i in range(0, 15):
    training_nr = 30 * i
    csvfilename = path + str(training_nr) + ''
    outputfile = str(training_nr) + ''
    # Load training set
    test_set = loadTestSet(csvfilename)
    X, Y = test_set.loadTestSet()
    #X = MinMaxScaler().fit_transform(X)
    X = StandardScaler().fit_transform(X)
    X_df = pd.DataFrame(data=X.astype(float))
    Y_df = pd.DataFrame(data=Y.astype(int))
    result = pd.concat([X_df, Y_df], axis=1)
    result.to_csv(outputfile,
                  sep=';',
                  header=[
                      'Band_6_0406_Mean', 'Band_7_1104_Mean',
                      'Band_8_0406_Mean', 'Band_8A_0406_Mean',
                      'Band_11_0406_Mean', 'Band_12_0406_Mean', 'AfgKode'
                  ],
                  float_format='%.10f',
                  index=False)
print("Done")
def Load_Data(ipath):
    data = np.loadtxt(ipath, skiprows=1, dtype=str)
    # standardize the attribute columns in place, keeping the string matrix layout
    attri = data[:, 3:-1].astype(float)
    attri = StandardScaler().fit_transform(attri)
    data[:, 3:-1] = attri.astype(str)
    return data
def get_test_blobs(n_samples=1000, d=2):
    # d is currently unused; the fixed centers make the data 2-dimensional
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=n_samples,
                                centers=centers,
                                cluster_std=0.1,
                                random_state=0)
    X = StandardScaler().fit_transform(X)
    X = X.astype(np.float32)
    return X
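# A minimal usage sketch (assumed): cluster the test blobs with scikit-learn's
# DBSCAN, as in the neighboring DBSCAN examples in this collection.
from sklearn.cluster import DBSCAN

X_blobs = get_test_blobs(n_samples=1000)
blob_labels = DBSCAN(eps=0.3, min_samples=10).fit(X_blobs).labels_
print('clusters:', len(set(blob_labels)) - (1 if -1 in blob_labels else 0))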
class DIGITSClassifier(optimizee.Optimizee):
    name = 'digits_classifier'

    def __init__(self, num_units=20, num_layers=1, dataset_name='digits',
                 activation='sigmoid', return_func=False):
        super(DIGITSClassifier, self).__init__()
        self.dataset_name = dataset_name
        if dataset_name == 'digits':
            dataset = load_digits(n_class=10)
        elif dataset_name == 'mnist':
            dataset = fetch_mldata('MNIST original',
                                   data_home='/srv/hd1/data/vyanush/')
        elif dataset_name == 'random':
            num_features = np.random.randint(low=1, high=100)
            data_size = np.random.randint(low=100, high=1000)
            w = np.random.normal(size=num_features)
            w0 = np.random.normal(size=1, scale=0.1)
            X = np.random.normal(size=(data_size, num_features))
            Y = X.dot(w) + w0 > 0
            dataset = Dataset(X, Y)
        self.X, self.Y = dataset.data, dataset.target
        self.X, self.Y = utils.shuffle(self.X, self.Y)
        self.X = StandardScaler().fit_transform(self.X.astype(np.float32))
        self.num_units = num_units
        self.num_layers = num_layers
        self.activation = activation
        self.return_func = return_func
        self.x_len = 0
        self.x_len_counted = False

    def get_x_dim(self):
        return self.dim

    def build(self):
        with tf.variable_scope('digits_classifier'):
            self.dim = tf.placeholder(tf.int32, [], name='dim')
            # n_bptt_steps * batch_size * data_size * num_features
            self.x = tf.placeholder(tf.float32,
                                    [None, None, None, self.X.shape[1]],
                                    name='X')
            self.y = tf.placeholder(tf.int32, [None, None, None], name='y')

    def loss(self, x, i):
        self.coord_pos = 0
        self.coord_vector = x
        dims = [self.num_units] * self.num_layers
        # self.x[i].shape == (batch_size, data_size, n_inputs)
        #pred = tf.transpose(self.x[i], perm=[0, 2, 1])
        pred = self.x[i][0]
        activation = getattr(tf.nn, self.activation)
        with tf.variable_scope('nn_classifier/loss',
                               custom_getter=self.custom_getter) as scope, \
                tf.device('/gpu:0'):
            for n_outputs in dims:
                pred = tf.layers.dense(pred, n_outputs, activation=None)
                pred = tf.layers.batch_normalization(pred)
                pred = activation(pred)
            pred = tf.layers.dense(pred, 10)
            #pred = tf.transpose(pred, perm=[0, 2, 1])
            # shape = (batch_size, data_size, n_classes)
            f = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y[i][0], logits=pred), axis=-1)
            p = tf.argmax(tf.nn.softmax(pred), axis=-1)
            print(p.get_shape(), self.y[i][0].get_shape())
            acc = tf.reduce_mean(tf.cast(
                tf.equal(tf.cast(p, tf.int32), self.y[i][0]), tf.float32),
                axis=-1)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                g = self.grad(x, f)
        if not self.x_len_counted:
            self.x_len = self.coord_pos
            self.x_len_counted = True
        self.vars_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=scope.name)
        if self.return_func:
            return f, g
        else:
            return acc, g

    def get_initial_x(self, batch_size=1):
        if self.dataset_name == 'mnist':
            self.batch_size = np.random.randint(low=1, high=256)
        else:
            self.batch_size = np.random.randint(low=1,
                                                high=self.X.shape[0] // 4 + 1)
        self.s = 0
        print("{} classifier; batch_size: {}".format(self.dataset_name,
                                                     self.batch_size))
        #w = np.random.normal(0, 0.01, size=(batch_size, self.x_len))
        w = np.zeros(self.x_len)
        print("x_len: ", w.shape)
        for name, d in self.coord_vars.items():
            start, end = d['pos']
            # initialize each coordinate block from its variable's initializer
            dummy = (d['initializer']
                     or glorot_uniform_initializer(dtype=tf.float32))(d['shape'])
            val = tf.get_default_session().run(dummy)
            w[start:end] = val.reshape(-1)
        return w[None, :]

    def get_new_params(self, batch_size=1):
        return {self.dim: self.x_len}

    def get_next_dict(self, n_bptt_steps, batch_size=1):
        x = np.zeros((n_bptt_steps, 1, self.batch_size, self.X.shape[1]))
        y = np.zeros((n_bptt_steps, 1, self.batch_size))
        for i in range(n_bptt_steps):
            if self.s + self.batch_size > self.X.shape[0]:
                self.s = 0
            pos_cur, pos_next = self.s, self.s + self.batch_size
            pos_cur = np.random.randint(low=0,
                                        high=self.X.shape[0] - self.batch_size)
            pos_next = pos_cur + self.batch_size
            x[i] = np.tile(self.X[None, pos_cur:pos_next], (batch_size, 1, 1))
            y[i] = np.tile(self.Y[None, pos_cur:pos_next], (batch_size, 1, 1))
            self.s = pos_next
        return {
            self.x: x,
            self.y: y,
        }
        # (fragment: tail end of a gensim Word2Vec(...) call from the preceding lines)
        min_count=0, sg=1, iter=1,
        workers=multiprocessing.cpu_count())
    wv = w2v.wv
    A = [wv[str(i)] for i in range(num_list[-1])]
    np.save("../%s_wv_%d_%s.npy" % (args.data, args.dimensions, args.walk), A)
    from sklearn.preprocessing import StandardScaler
    A = StandardScaler().fit_transform(A)
    # prepend a zero row so index 0 can serve as the padding embedding
    A = np.concatenate((np.zeros((1, A.shape[-1]), dtype='float32'), A),
                       axis=0)
    A = A.astype('float32')
    A = torch.tensor(A).to(device)
    print(A.shape)
    node_embedding = Wrap_Embedding(int(num_list[-1] + 1),
                                    args.dimensions,
                                    scale_grad_by_freq=False,
                                    padding_idx=0,
                                    sparse=False)
    node_embedding.weight = nn.Parameter(A)
elif args.feature == 'adj':
    flag = False
    node_embedding = MultipleEmbedding(embeddings_initial, bottle_neck, flag,
                                       num_list).to(device)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# READ DATA
df = pd.read_csv("teleCust1000t.csv")

# SPLIT INTO LABELS AND FEATURES
X = df[[
    'region', 'tenure', 'age', 'marital', 'address', 'income', 'ed',
    'employ', 'retire', 'gender', 'reside'
]].values
Y = df[['custcat']].values

# FEATURE NORMALISATION
X = StandardScaler().fit(X).transform(X.astype(float))
Y = Y.reshape(1000)

# DATA SPLITTING
train_x, test_x, train_y, test_y = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=4)

# CREATING MODEL
model = KNeighborsClassifier(n_neighbors=5)
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print("Accuracy of a model is ", accuracy_score(test_y, prediction))
    #out = torch.clamp(out, 1.0, 5.0)
    return out


df = pd.read_csv(inputFileName, sep=",")
df_percent = df.sample(frac=1).reset_index(drop=True).sample(frac=subsetFrac)
train = df_percent.sample(frac=1.0 - testFrac)
test = df_percent.drop(train.index)

train_labels = torch.tensor(
    np.expand_dims(train['Stars'].values.astype(np.float32), axis=1))
train_temp = train.drop('Stars', axis=1) if dropLastN == 0 else train.drop(
    'Stars', axis=1).iloc[:, :-dropLastN]
train_norm = StandardScaler().fit_transform(train_temp)
train_features = torch.tensor(train_norm.astype(np.float32))
train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(
    train_features, train_labels),
                                           batch_size=batch_size,
                                           shuffle=False,
                                           pin_memory=True)

test_labels = torch.tensor(
    np.expand_dims(test['Stars'].values.astype(np.float32), axis=1))
test_temp = test.drop('Stars', axis=1) if dropLastN == 0 else test.drop(
    'Stars', axis=1).iloc[:, :-dropLastN]
test_norm = StandardScaler().fit_transform(test_temp)
test_features = torch.tensor(test_norm.astype(np.float32))
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(
    test_features, test_labels),
                                          batch_size=batch_size,
                                          shuffle=False,
                                          pin_memory=True)
######
for dataset in ['biodeg.csv_header', 'voice.csv']:
    print("Working on", dataset, "data set...")
    data_df = pd.read_csv(dataset)
    if dataset == "biodeg.csv_header":
        dataX = data_df.iloc[:, :41]
        dataY = data_df.iloc[:, 41]
        dataset = "QSAR"
        comps = np.int32(np.linspace(2, 40, 20))
    else:
        dataX = data_df.iloc[:, :20]
        dataY = data_df.iloc[:, 20]
        dataset = "VOICE"
        comps = np.int32(np.linspace(2, 20, 20))
    dataX = StandardScaler().fit_transform(dataX.astype('float64'))

    #######################################################
    split = train_test_split(dataX, dataY, test_size=0.3, random_state=42)
    (trainData, testData, trainTarget, testTarget) = split

    # baseline accuracy on the untransformed features
    model = LinearSVC()
    model.fit(trainData, trainTarget)
    baseline = metrics.accuracy_score(model.predict(testData), testTarget)

    print("Running RP...")
    accuracies = []
    for comp in comps:
        # create the random projection
        #sp = SparseRandomProjection(n_components = comp)
train = train.drop(['date'], axis=1)
test = test.drop(['date', 'S1'], axis=1)
y = train['S1'].astype(np.float32)
train = train.drop(['S1'], axis=1)

######## check whether the data is linear ########
#plt.scatter(train['S7'], train['S1'])
#plt.xlabel('S7')
#plt.ylabel('S1')
#plt.show()

######## PRE-PROCESSING ########
# fit the scaler on the training data and reuse it for the test data, so the
# test set is not scaled with its own (leaked) statistics
scaler = StandardScaler().fit(train)
train = scaler.transform(train).astype(np.float32)
test = scaler.transform(test).astype(np.float32)

######## RF MODEL ########
rf = RandomForestRegressor(n_estimators=150, max_depth=15)
rf.fit(train, y)

######## FEATURE IMPORTANCE ########
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
def patch_probability_generator(PARAMS, fl, Train_Params):
    startTime = time.process_time()
    labels_sp = []
    labels_mu = []
    pred_opDir = PARAMS['opDir'] + '/__Frame_Predictions_CNN/'
    if not os.path.exists(pred_opDir):
        os.makedirs(pred_opDir)
    result_fName = fl + '_fold' + str(PARAMS['fold']) + '_result'
    n_fft = PARAMS['n_fft'][PARAMS['Model']]
    n_mels = PARAMS['n_mels'][PARAMS['Model']]
    featName = PARAMS['featName'][PARAMS['Model']]

    if not os.path.exists(pred_opDir + result_fName + '.pkl'):
        fName_path = PARAMS['test_path'] + '/features/' + fl + '.npy'
        if not os.path.exists(fName_path):
            return {}
        fv = np.load(fName_path, allow_pickle=True)
        fv = get_featuregram(PARAMS, PARAMS['feature_opDir'], fl, fv, n_fft,
                             n_mels, featName, save_feat=True)
        if 'HarmPerc' not in featName:
            fv = StandardScaler(copy=False).fit_transform(fv.T).T
        else:
            # standardize the harmonic and percussive halves separately
            nDim = np.shape(fv)[0]
            fv_H = fv[:int(nDim / 2), :]
            fv_H = StandardScaler(copy=False).fit_transform(fv_H.T).T
            fv_P = fv[int(nDim / 2):, :]
            fv_P = StandardScaler(copy=False).fit_transform(fv_P.T).T
            fv = np.append(fv_H.astype(np.float32), fv_P.astype(np.float32),
                           axis=0)
        nFrames = np.shape(fv)[1]
        annotations_mu, annotations_sp, music_marker, speech_marker = get_annotations(
            PARAMS['test_path'], fl, nFrames, PARAMS['opDir'])

        pred = np.empty([])
        pred_lab = np.empty([])
        batch_size = 10000
        labels_mu = []
        labels_sp = []
        for batchStart in range(0, np.shape(fv)[1], batch_size):
            batchEnd = np.min([batchStart + batch_size, np.shape(fv)[1]])
            fv_temp = fv[:, batchStart:batchEnd]
            music_marker_temp = music_marker[batchStart:batchEnd]
            speech_marker_temp = speech_marker[batchStart:batchEnd]
            print('\tBatch: (', batchStart, batchEnd, ') ', np.shape(fv_temp),
                  ' mu=', np.sum(music_marker_temp),
                  ' sp=', np.sum(speech_marker_temp), end=' ', flush=True)
            fv_patches_temp = get_feature_patches(PARAMS, fv_temp,
                                                  PARAMS['W'],
                                                  PARAMS['W_shift_test'],
                                                  featName)
            # a patch is labeled positive when more than half its frames are positive
            labels_mu_patches = cextract_patches(
                np.array(music_marker_temp, ndmin=2),
                np.shape(np.array(music_marker_temp, ndmin=2)), PARAMS['W'],
                PARAMS['W_shift_test']).astype(int)
            labels_mu_temp = (
                (np.sum(np.squeeze(labels_mu_patches, axis=1), axis=1) /
                 np.shape(labels_mu_patches)[2]) > 0.5).astype(int)
            labels_sp_patches = cextract_patches(
                np.array(speech_marker_temp, ndmin=2),
                np.shape(np.array(speech_marker_temp, ndmin=2)), PARAMS['W'],
                PARAMS['W_shift_test']).astype(int)
            labels_sp_temp = (
                (np.sum(np.squeeze(labels_sp_patches, axis=1), axis=1) /
                 np.shape(labels_sp_patches)[2]) > 0.5).astype(int)
            if 'Lemaire_et_al' in PARAMS['Model']:
                # TCN input shape=(batch_size, timesteps, ndim)
                fv_patches_temp = np.transpose(fv_patches_temp, axes=(0, 2, 1))
            if PARAMS['signal_type'] == 'music':
                pred_temp = Train_Params['model'].predict(x=fv_patches_temp)
                CM, acc, P, R, F1 = getPerformance(
                    np.array((pred_temp > 0.5).astype(int)), labels_mu_temp)
            elif PARAMS['signal_type'] == 'speech':
                pred_temp = Train_Params['model'].predict(x=fv_patches_temp)
                CM, acc, P, R, F1 = getPerformance(
                    np.array((pred_temp > 0.5).astype(int)), labels_sp_temp)
            pred_lab_temp = np.array(pred_temp > 0.5).astype(int)
            if np.size(pred) <= 1:
                pred = pred_temp
                pred_lab = pred_lab_temp
            else:
                pred = np.append(pred, pred_temp)
                pred_lab = np.append(pred_lab, pred_lab_temp)
            labels_mu.extend(labels_mu_temp)
            labels_sp.extend(labels_sp_temp)
            print(np.shape(fv_patches_temp), np.shape(pred_temp),
                  np.shape(pred), ' acc=', acc, F1)

        if PARAMS['signal_type'] == 'music':
            ConfMat, precision, recall, fscore = misc.getPerformance(
                pred_lab, labels_mu, labels=[0, 1])
            acc = np.round(np.sum(np.diag(ConfMat)) / np.sum(ConfMat), 4)
            print('Perf mu: ', acc, precision, recall, fscore)
        elif PARAMS['signal_type'] == 'speech':
            ConfMat, precision, recall, fscore = misc.getPerformance(
                pred_lab, labels_sp, labels=[0, 1])
            acc = np.round(np.sum(np.diag(ConfMat)) / np.sum(ConfMat), 4)
            print('Perf sp: ', acc, precision, recall, fscore)
        print('\n\n\n')

        probability_genTime = time.process_time() - startTime
        result = {
            'pred': pred,
            'pred_lab': pred_lab,
            'labels_sp': labels_sp,
            'labels_mu': labels_mu,
            'probability_genTime': probability_genTime,
            'ConfMat': ConfMat,
            'precision': precision,
            'recall': recall,
            'fscore': fscore,
            'accuracy': acc,
        }
        misc.save_obj(result, pred_opDir, result_fName)
        print('Test predictions saved!!!')
    else:
        result = misc.load_obj(pred_opDir, result_fName)
    return result
#
# Generate sample data for the DBSCAN test
#
# Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#example-cluster-plot-dbscan-py
#
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
X = X.astype(np.float64)

db = DBSCAN(eps=0.3, min_samples=10, metric='l2', algorithm='brute').fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# dump the points and their cluster labels for inspection
with open('dbscan.csv', 'w') as fscanout:
    with open('dbscan_labels.csv', 'w') as fscanlabout:
        for i in range(750):
            fscanout.write(",".join([str(x) for x in X[i, :]]) + "\n")
            fscanlabout.write(str(labels[i]) + "\n")
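# A minimal follow-up sketch (assumed): the snippet imports sklearn.metrics but
# never uses it; since ground-truth labels are available, the clustering can be
# scored as in the scikit-learn DBSCAN example it was lifted from.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels_true, labels))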
path = '/Users/zhangweijian01/Downloads/data.csv'
ori_data = pd.read_csv(path, header=0, sep='\t')
y_data = ori_data['Y']
x_data = ori_data.iloc[:, 3:]  # .ix is removed in newer pandas; use .iloc
x_data = x_data.fillna(x_data.mean())
y_data = y_data.fillna(y_data.mean())

# handle remaining missing values with a median imputer
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(x_data)
data_imp = imp.transform(x_data)

# scale and round to 4 decimal places
x_scaler = StandardScaler().fit_transform(data_imp)
x_scaler = x_scaler.astype(np.float64, copy=False)
x_scaler = np.round(x_scaler, 4)

# save preprocessed data to file
np.savetxt("newdata2.csv", x_scaler, delimiter=",")
with open('newdata1.csv', 'w') as f:
    for i in range(0, len(x_scaler)):
        line = str(y_data[i])
        for j in range(0, len(x_scaler[i])):
            line = line + ',' + str(x_scaler[i][j])
        line += '\n'
        f.write(line)