def train():
    uid = request.get_json()['user']
    trainsongs = request.get_json()['songs']
    trainsongs.sort()
    modelfile = krotos.modelfname(trainsongs, uid)
    conn = engine.connect()
    df = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == False), conn)
    testdf = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == True), conn)
    df = df.loc[df['songid'].isin(trainsongs)]
    testdf = testdf.loc[testdf['songid'].isin(trainsongs)]
    X, Y = df.values[:, 1:], df.values[:, 0]
    x, y = testdf.values[:, 1:], testdf.values[:, 0]
    # Fit the scaler and label encoder on the training split only and reuse
    # them for the test split, so test statistics don't leak into scaling and
    # the label mapping stays consistent across splits.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X).astype('float32')
    x = scaler.transform(x).astype('float32')
    encoder = LabelEncoder().fit(Y)
    Y = encoder.transform(Y)
    y = encoder.transform(y)
    Y = krotos.padlabels(Y)
    y = krotos.padlabels(y)
    acc = krotos.train(X, Y, x, y, modelfile)
    if acc:
        return {"status": "ok", "modelfile": modelfile, "accuracy": acc}
    else:
        return {"status": "not ok"}, 500
def comparealgos():
    from sklearn import metrics
    trainsongs = [1, 2]
    uid = 1
    modelfile = krotos.modelfname(trainsongs, uid)
    conn = engine.connect()
    df = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == False), conn)
    testdf = pd.read_sql(
        select(selectsamples12).where(samples.c.testdata == True), conn)
    df = df.loc[df['songid'].isin(trainsongs)]
    testdf = testdf.loc[testdf['songid'].isin(trainsongs)]
    X, Y = df.values[:, 1:], df.values[:, 0]
    x, y = testdf.values[:, 1:], testdf.values[:, 0]
    # Reuse the scaler and encoder fitted on the training split for the test split.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X).astype('float32')
    x = scaler.transform(x).astype('float32')
    encoder = LabelEncoder().fit(Y)
    Y = encoder.transform(Y)
    y = encoder.transform(y)
    Y = krotos.padlabels(Y)
    y = krotos.padlabels(y)
    krotos.train(X, Y, x, y, modelfile)
    # Baseline: a random forest on the same features, for comparison.
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, Y)
    pred = np.array(clf.predict(x))
    acc = metrics.accuracy_score(y, pred) * 100
    return {"status": "ok", "rf_accuracy": acc}
def load_data(PARAMS, folder, file_list):
    n_fft = PARAMS['n_fft'][PARAMS['Model']]
    n_mels = PARAMS['n_mels'][PARAMS['Model']]
    featName = PARAMS['featName'][PARAMS['Model']]
    FV = np.empty([], dtype=np.float32)
    labels_mu = np.empty([], dtype=np.int32)
    labels_sp = np.empty([], dtype=np.int32)
    fl_count = 0
    for fName in file_list:
        fl_count += 1
        fName_path = folder + '/features/' + fName + '.npy'
        if not os.path.exists(fName_path):
            continue
        fv = np.load(fName_path, allow_pickle=True)
        fv = get_featuregram(PARAMS, PARAMS['feature_opDir'], fName, fv,
                             n_fft, n_mels, featName, save_feat=True)
        nFrames = np.shape(fv)[1]
        annotations_mu, annotations_sp, music_marker, speech_marker = get_annotations(
            PARAMS['test_path'], fName, nFrames, PARAMS['opDir'])
        if 'HarmPerc' not in featName:
            # Standardize frame-wise (features along columns after transposing).
            fv = StandardScaler(copy=False).fit_transform(fv.T).T
        else:
            # Standardize the harmonic and percussive halves separately.
            nDim = np.shape(fv)[0]
            fv_H = fv[:int(nDim / 2), :]
            fv_H = StandardScaler(copy=False).fit_transform(fv_H.T).T
            fv_P = fv[int(nDim / 2):, :]
            fv_P = StandardScaler(copy=False).fit_transform(fv_P.T).T
            fv = np.append(fv_H.astype(np.float32), fv_P.astype(np.float32),
                           axis=0)
        if np.size(FV) <= 1:
            FV = fv.astype(np.float32)
            labels_mu = music_marker.astype(np.int32)
            labels_sp = speech_marker.astype(np.int32)
        else:
            FV = np.append(FV, fv.astype(np.float32), axis=1)
            labels_mu = np.append(labels_mu, music_marker.astype(np.int32))
            labels_sp = np.append(labels_sp, speech_marker.astype(np.int32))
        print(fl_count, '/', len(file_list), fName, np.shape(FV),
              np.shape(labels_mu), np.shape(labels_sp))
    return FV, labels_mu, labels_sp
def create_model(k=29):
    # load data
    df = pd.read_csv('BikeShare.csv')
    df.index = range(1, len(df) + 1)
    # select features for model input
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values
    # select the target
    y = df['EndStationName'].values
    # standardize the features
    X = StandardScaler().fit(X).transform(X.astype(float))
    from sklearn.model_selection import train_test_split
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=4)
    # train the model and persist it (use a context manager so the file is closed)
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    with open('knnpickle_file', 'wb') as knnPickle:
        pickle.dump(knn, knnPickle)
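# A minimal companion sketch, not part of the original snippet: persist the
# fitted scaler together with the KNN model so prediction code can reuse the
# exact training-time normalization instead of re-fitting it from the raw CSV
# (as the predict method further below does). The output file name
# 'knn_and_scaler.pkl' is an illustrative assumption.
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def create_model_with_scaler(k=29):
    df = pd.read_csv('BikeShare.csv')
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values.astype(float)
    y = df['EndStationName'].values
    scaler = StandardScaler().fit(X)
    knn = KNeighborsClassifier(n_neighbors=k).fit(scaler.transform(X), y)
    with open('knn_and_scaler.pkl', 'wb') as f:
        pickle.dump({'scaler': scaler, 'knn': knn}, f)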
class WineQualityDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = pd.read_csv(path, delimiter=";")
        print(f"Rows, columns: {str(df.shape)}")
        print(df.head())
        # create a binary classification version of the target variable
        df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']]
        df = df.drop(['quality'], axis=1)
        print(df['goodquality'].value_counts())
        # store the inputs and outputs
        self.X = StandardScaler().fit_transform(df.values[:, :-1])
        self.y = df.values[:, -1]
        # ensure input data is floats
        self.X = self.X.astype('float32')
        self.y = self.y.astype('float32')
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
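# A minimal usage sketch, assumed rather than taken from the original source:
# wrap the splits in DataLoaders for training. 'winequality-red.csv' is an
# illustrative path for the UCI wine-quality file this class expects.
from torch.utils.data import DataLoader

dataset = WineQualityDataset('winequality-red.csv')
train_split, test_split = dataset.get_splits(n_test=0.33)
train_dl = DataLoader(train_split, batch_size=32, shuffle=True)
test_dl = DataLoader(test_split, batch_size=1024, shuffle=False)
for batch_X, batch_y in train_dl:
    pass  # feed batches to a model here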
def implement_pca_between_two_frames(image1, image2):
    # read images
    pic1 = cv2.imread(image1)
    pic2 = cv2.imread(image2)
    # convert BGR to grayscale
    prvs = cv2.cvtColor(pic1, cv2.COLOR_BGR2GRAY)
    nxt = cv2.cvtColor(pic2, cv2.COLOR_BGR2GRAY)
    # calculate dense optical flow (Farneback)
    flow = cv2.calcOpticalFlowFarneback(prvs, nxt, None, 0.5, 3, 15, 3, 5,
                                        1.2, 0)
    # obtain the angle matrix: the magnitude is discarded and the angles are
    # measured in degrees
    _, angle_matrix = cv2.cartToPolar(flow[..., 0], flow[..., 1],
                                      angleInDegrees=True)
    # run plain PCA on the standardized angle matrix (the coarse foreground)
    sklearn_pca = sklearnPCA()
    angle_std = StandardScaler().fit_transform(angle_matrix)
    sklearn_pca.fit_transform(angle_std)
    # convert to uint8 so the matrix can be written as an image
    pca_implement = angle_std.astype(np.uint8)
    # write the result
    cv2.imwrite('pca_fore_ground_matrix_' + str(image1) + '.png',
                pca_implement)
    cv2.destroyAllWindows()
def diabetes():
    df = pd.read_csv('diabetes.csv')
    cdf = df[[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
    ]]
    X = df[[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age'
    ]].values
    X = StandardScaler().fit(X).transform(X.astype(float))
    y = df['Outcome'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=4)
    '''
    k = 4
    neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
    print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
    '''
    # Sweep k from 1 to Ks-1 and record the test accuracy for each value.
    Ks = 10
    mean_acc = np.zeros((Ks - 1))
    std_acc = np.zeros((Ks - 1))
    for n in range(1, Ks):
        # train the model and predict for this k
        neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        yhat = neigh.predict(X_test)
        mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
        std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
    print("The best accuracy was", mean_acc.max(), "with k =",
          mean_acc.argmax() + 1)
def get_moons_dataset(n_samples=1000):
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    X, _ = noisy_moons
    X = StandardScaler().fit_transform(X)
    dataset = tf.data.Dataset.from_tensor_slices(X.astype(np.float32))
    return dataset
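# A minimal usage sketch (assumed): shuffle and batch the returned dataset
# before iterating, as is typical for tf.data input pipelines.
moons_ds = get_moons_dataset(n_samples=1000)
for batch in moons_ds.shuffle(1000).batch(128):
    pass  # batch is a float32 tensor of shape (<=128, 2)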
def PCA():
    dataset = readDataset()
    # Step 1: center the data around 0. When the features use different units
    # of measurement, also divide by the standard deviation.
    scaled = StandardScaler().fit_transform(dataset.X.astype(float))
    # Step 2: compute the covariance matrix of the data, where covariance
    # measures the degree of numerical interdependence between two variables.
    # Since the data is standardized, the correlation matrix computed here
    # equals the covariance matrix.
    covMatrix = np.corrcoef(scaled.astype(float).T)
    # Step 3: compute the eigenvalues and eigenvectors of the covariance matrix
    w, v = np.linalg.eig(covMatrix)
    # Check how much information can be attributed to each component
    percentage = (w / sum(w)) * 100
    print('Information attributed to each component: ', percentage)
    eig_pairs = [(np.abs(w[i]), v[:, i]) for i in range(len(w))]
    # Horizontally concatenate the eigenvectors into a projection matrix.
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1), eig_pairs[3][1].reshape(4, 1)))
    X = scaled.dot(matrix_w)
    df = pd.DataFrame(data=X,
                      columns=[
                          'Principal component 1', 'Principal component 2',
                          'Principal component 3', 'Principal component 4'
                      ])
    df['target'] = dataset.Y
    sns.pairplot(data=df, hue='target')
    plt.show()
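# A minimal cross-check sketch (assumed, with random stand-in data): on
# standardized inputs, the manual eigendecomposition above should agree with
# scikit-learn's PCA up to the sign of each component, and
# explained_variance_ratio_ corresponds to the per-component percentages.
import numpy as np
from sklearn.decomposition import PCA as SkPCA
from sklearn.preprocessing import StandardScaler

scaled_demo = StandardScaler().fit_transform(np.random.rand(100, 4))
pca = SkPCA(n_components=4)
X_sk = pca.fit_transform(scaled_demo)
print('Explained variance per component (%):',
      pca.explained_variance_ratio_ * 100)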
def compute_DBSCAN(features):
    features = StandardScaler().fit_transform(features.astype(float))
    # Compute DBSCAN
    db = DBSCAN(eps=0.25, min_samples=4).fit(features)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    return core_samples_mask, labels
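# A minimal usage sketch (assumed, with synthetic blobs as stand-in input):
# count clusters from the returned labels, treating -1 as noise, mirroring the
# standard scikit-learn DBSCAN example.
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=300, centers=3, random_state=0)
core_samples_mask, labels = compute_DBSCAN(features)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('clusters:', n_clusters, 'noise points:', n_noise)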
def apricot_select(data, k, standardize=True, chunksize=20000):
    """Does 'farthest point sampling' with apricot. For memory limitation
    reasons it is chunked with a hardcoded chunksize.
    """
    if standardize:
        print('standardizing data')
        data = StandardScaler().fit_transform(data)
    data = data.astype(np.float64)
    num_chunks = int(data.shape[0] / chunksize)
    if num_chunks > 1:
        chunksize = int(data.shape[0] / num_chunks)
    else:
        num_chunks = 1
        chunksize = len(data)
    # This assumes shuffled data and is used to make stuff a bit less
    # memory intensive
    chunklist = []
    to_select = int(k / num_chunks)
    print('Will use {} chunks of size {}'.format(num_chunks, chunksize))
    num_except = 0
    for d_ in tqdm(chunks(data, chunksize)):
        print('Current chunk has size {}'.format(len(d_)))
        if len(d_) > to_select:  # otherwise it makes no sense to select something
            try:
                X_subset = FacilityLocationSelection(to_select).fit_transform(d_)
                chunklist.append(X_subset)
            except Exception:  # pylint:disable=broad-except
                num_except += 1
                if num_except > 1:  # pylint:disable=no-else-return
                    warnings.warn(
                        'Could not perform diverse set selection for two attempts, '
                        'will perform random choice')
                    return np.random.choice(len(data), k, replace=False)
                else:
                    print('will use greedy select now')
                    X_subset = _greedy_loop(d_, to_select, 'euclidean')
                    chunklist.append(X_subset)
    # map the selected rows back to indices in the full data array
    greedy_indices = []
    subset = np.vstack(chunklist)
    print(subset.shape)
    for d in subset:
        index = np.where(np.all(data == d, axis=1))[0][0]
        greedy_indices.append(index)
    del data
    del subset
    output = list(set(greedy_indices))
    print(len(output))
    return output
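# A minimal usage sketch (assumed, with random stand-in data): select a diverse
# subset of 500 rows; the returned value is a list of row indices into the
# original array (or a random index array if selection fails twice).
import numpy as np

X = np.random.rand(50000, 16)
selected = apricot_select(X, k=500)
X_diverse = X[selected]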
def predict():
    import autokeras as ak
    iterations = 3
    uid = request.get_json()['user']
    trainsongs = request.get_json()['songs']
    trainsongs.sort()
    reclen = 15
    if 'reclen' in request.get_json():
        reclen = request.get_json()['reclen']
    if 'iterations' in request.get_json():
        iterations = request.get_json()['iterations']
    iterations = int(iterations)
    modelfile = krotos.modelfname(trainsongs, uid)
    print(modelfile)
    if os.path.isfile(modelfile):
        model = load_model(modelfile, custom_objects=ak.CUSTOM_OBJECTS)
    else:
        return {"status": "error", "reason": "train first"}, 500
    recordings = []
    for iteration in range(iterations):
        recordings.append(krotos.getFeatures(duration=reclen))
    predictions = [None] * iterations
    n_features = 12
    for i, recording in enumerate(recordings):
        # Note: this re-fits a scaler on each recording; the model is assumed
        # to have been trained on features standardized the same way.
        recording = StandardScaler().fit_transform(recording)
        recording = recording.astype('float32')
        # sum the per-frame prediction vectors for each recording
        for row in np.array(recording):
            row = np.array([row[:n_features]])
            prediction = model.predict(row)
            if predictions[i] is not None:
                predictions[i] += np.array(prediction)
            else:
                predictions[i] = np.array(prediction)
    ret = []
    preds = []
    for prediction in predictions:
        ret.append(prediction.tolist())
        preds.append(trainsongs[np.argmax(prediction)])
    return {
        "predictions": ret,
        "ids": preds,
        "overall": int(np.bincount(preds).argmax())
    }
def __init__(self, X, y):
    if not torch.is_tensor(X):
        X = StandardScaler().fit_transform(X)
        X = X.astype(np.float32)
        self.X = torch.from_numpy(X)
    else:
        raise ValueError("X should be numpy")
    if not torch.is_tensor(y):
        y = y.astype(np.float32)
        self.y = torch.from_numpy(y)
    else:
        raise ValueError("y should be numpy")
def test_32_64_decomposition_shape():
    """Test that the decomposition is similar for 32 and 64 bits data"""
    # see https://github.com/scikit-learn/scikit-learn/issues/18146
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # Compare the shapes (corresponds to the number of non-zero eigenvalues)
    kpca = KernelPCA()
    assert (kpca.fit_transform(X).shape ==
            kpca.fit_transform(X.astype(np.float32)).shape)
class Moons(Dataset):
    def __init__(self, n_samples, shuffle, noise):
        self.X, self.y = make_moons(n_samples=n_samples,
                                    shuffle=shuffle,
                                    noise=noise)
        self.X = StandardScaler().fit_transform(self.X)
        # np.int is deprecated/removed in recent numpy; use np.int64 instead
        self.X, self.y = self.X.astype(np.float32), self.y.astype(np.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), torch.from_numpy(
            np.array(self.y[idx]))
def decimate_data(datapath, doplot):
    df = pd.read_csv(datapath)
    df.columns = ["V" + str(i) for i in range(1, len(df.columns) + 1)]
    df.V1 = df.V1.astype(str)
    X = df.loc[:, "V2":]  # independent variables data
    y = df.V1  # dependent variable data
    print("Initial number of samples: " + str(len(df)))
    encoder = preprocessing.LabelEncoder()

    """ Initial random sampling: take a 20% sample of the total input sample """
    pctg = 0.20
    print("Random sampling rate: " + str(pctg * 100) + "%")
    sample_len = int(len(df) * pctg)
    random_sample = X.take(np.random.permutation(len(df))[:sample_len])
    random_sample_encoded = random_sample.apply(encoder.fit_transform)
    print("Number of random samples: " + str(len(random_sample)))

    """ K-means clustering """
    x = np.array(random_sample_encoded)
    x = x.astype(int)
    ks = range(1, 16)
    kmeans = [KMeans(n_clusters=i, random_state=0) for i in ks]
    score = [kmeans[i].fit(x).score(x) for i in range(len(kmeans))]
    score = [-score[i] for i in range(len(ks))]

    """ Plot for evaluating the elbow for k-means clustering """
    if doplot:
        # one color per plotted k value
        colors = np.random.rand(len(ks))
        plt.suptitle("Elbow Plot", fontsize=14, fontweight='bold')
        plt.scatter(ks, score, c=colors, alpha=0.5)
        plt.plot(ks, score)
        plt.ylabel('Objective Function Value')
        plt.xlabel('Number of clusters')
        plt.show()

    """ From the elbow plot, the elbow is found at k = 4;
    next, do stratified sampling on those 4 clusters. """
    k_elbow = 4
    decimated_data = stratified_sampling(kmeans[k_elbow - 1], x)
    standard_data = decimated_data[:, :-1]
    standard_data = StandardScaler().fit_transform(standard_data.astype(float))
    cluster_id_col = decimated_data[:, -1:]
    return standard_data, cluster_id_col
def predict(self, loc, time):
    # use a parameterized query instead of string concatenation to avoid
    # SQL injection
    rec = self.cur.execute(
        "SELECT StartStationID, StartStationLatitude, StartStationLongitude "
        "FROM BikeShare WHERE StartStationName LIKE ? LIMIT 1",
        (loc,)).fetchone()
    if not rec:
        return -1
    sample = [time * 60, rec[0], rec[1], rec[2], time]
    # re-fit the scaler on the training CSV so the query record is normalized
    # with the same statistics used at training time
    df = pd.read_csv('BikeShare.csv')
    df.index = range(1, len(df) + 1)
    X = df[[
        'TripDuration', 'StartStationID', 'StartStationLatitude',
        'StartStationLongitude', 'TripDurationinmin'
    ]].values
    record = np.asarray(sample).reshape(1, -1)
    record = StandardScaler().fit(X).transform(record.astype(float))
    with open('knnpickle_file', 'rb') as f:
        knn = pickle.load(f)
    pred = knn.predict(record)
    return pred[0]
def Get_onewell_Data(window_size):
    '''Return one well's data for prediction'''
    filepath = "D:\投的文章\Paper_基于1DCNN的岩相分类\图鉴\井的岩相\\57-04-0-5.txt"
    data = np.loadtxt(filepath, skiprows=1, dtype=str)
    attri = data[:, 1:-1].astype(float)
    attri = StandardScaler().fit_transform(attri)
    data[:, 1:-1] = attri.astype(str)
    # collect the data for each sliding window
    data_list = []
    depth_list = []
    for y in range(window_size, len(data) - window_size):
        w_data = data[y - window_size:y + window_size + 1]
        attri = w_data[:, 1:-1].T.astype(float)
        label = w_data[window_size, -1].astype(float)  # center-sample label (unused here)
        depth = w_data[window_size, 0]
        data_list.append(attri)
        depth_list.append(depth)
    attri = np.array(data_list)
    attri = torch.tensor(attri)
    return attri, np.row_stack(depth_list)
def cluster(self):
    cluster_file = open("cluster.txt", "w")
    print(self.data.shape)
    X = self.data
    X = StandardScaler().fit_transform(X)
    db = DBSCAN(eps=10, min_samples=2).fit(X)
    labels = db.labels_
    print(labels)
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    index = []
    fitness = []
    mean = []
    print("number of estimated clusters : %d" % n_clusters_)
    cluster_file.write("number of estimated clusters : %d" % n_clusters_ + "\n")
    for k in range(n_clusters_):
        my_members = (labels == k)
        for i in range(len(X)):
            if my_members[i]:
                index += [i]
                if self.pure_data is not None:
                    # every pair of rows maps back to one individual
                    if i % 2 == 0:
                        num = int(i / 2)
                    else:
                        num = int((i - 1) / 2)
                    fitness += [self.pure_data[num].fitness.values]
        if fitness != []:
            for i in range(len(fitness[0])):
                mean += [statistics.mean([ind[i] for ind in fitness])]
        cluster_file.write("index:" + "\n")
        cluster_file.write(str(index) + "\n")
        cluster_file.write("fitness:" + "\n")
        cluster_file.write(str(fitness) + "\n")
        cluster_file.write("mean fitness:" + "\n")
        cluster_file.write(str(mean) + "\n")
        cluster_file.write("members:" + "\n")
        cluster_file.write(str(X[my_members]) + "\n")
        print(index)
        print("members:")
        print(X[my_members])
        print("fitness:" + "\n")
        print(str(fitness) + "\n")
        index = []
        fitness = []
        mean = []
    cluster_file.close()

    # 2-D embedding of the clustered points for visualization
    import matplotlib.pyplot as plt
    mds = MDS(n_components=2)
    pos = mds.fit_transform(X.astype(np.float64))
    colors = list('bgrcmykbgrcmykbgrcmykbgrcmyk')
    plt.figure(2)
    for i in range(len(pos[:, 0])):
        plt.plot(pos[i, 0], pos[i, 1], 'o',
                 markerfacecolor=colors[labels[i]],
                 markeredgecolor='k')

    plt.figure(1)
    plt.title("number of estimated clusters : %d" % n_clusters_)
    colors_cluster = [colors[labels[i]] for i in range(len(X))]
    k = [i for i in range(len(X))]
    for j in range(9):
        plt.subplot(331 + j)  # subplot indices are 1-based (331..339)
        plt.ylabel(label[j])
        #plt.ylim(limits[j][0], limits[j][1])
        plt.bar(k, X[:, j], color=colors_cluster)  # feature j across all members
    plt.show()
import numpy as np
#from sklearn.cluster import DBSCAN
#from sklearn import metrics
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

##############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
X = X.astype(np.float32)

##############################################################################
# Compute DBSCAN
import dbscan

labels = np.array(dbscan.dbscan(X, "sparse").run(0.3, 10))
core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

##############################################################################
# Plot result
class LogregClassifier:
    def __init__(self, lambd=1e-4):
        self.lambd = lambd

    def build(self, optimizer):
        x, y = self.inputs()
        pred = self.inference(x)
        loss, acc = self.loss(pred, y)
        train_op = self.train_op(loss, optimizer)
        self.ops = {
            'x': x,
            'y': y,
            'pred': pred,
            #'loss': self.ema.average(loss),
            #'acc': self.ema.average(acc),
            'loss': loss,
            'acc': acc,
            'train_op': train_op
        }
        return self.ops

    def inputs(self):
        x = tf.placeholder(tf.float32, shape=[None, self.X.shape[1]])
        y = tf.placeholder(tf.int32, shape=[None])
        return x, y

    def inference(self, x):
        with tf.variable_scope('logreg_scope') as self.scope:
            pred = tf.layers.dense(x, 10)
        return pred

    def loss(self, logits, y):
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y, logits=logits), axis=-1)
        # L2 regularization on all non-bias variables in the scope
        loss += tf.add_n([
            tf.nn.l2_loss(v)
            for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=self.scope.name)
            if 'bias' not in v.name
        ]) * self.lambd
        p = tf.cast(tf.argmax(tf.nn.softmax(logits), axis=1), tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(p, y), tf.float32))
        self.ema = tf.train.ExponentialMovingAverage(decay=0.95)
        self.average_op = self.ema.apply([loss, acc])
        return loss, acc

    def train_op(self, loss, optimizer='adam'):
        if optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(1e-3)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=self.scope.name)
            train_op = optimizer.minimize(loss, var_list=all_vars)
        with tf.control_dependencies([train_op]):
            train_op = tf.group(self.average_op)
        return train_op

    def prepare_data(self, dataset_name):
        self.dataset_name = dataset_name
        if dataset_name == 'digits':
            dataset = load_digits(n_class=10)
        elif dataset_name == 'mnist':
            dataset = fetch_mldata('MNIST original',
                                   data_home='/srv/hd1/data/vyanush/')
        self.X, self.Y = dataset.data, dataset.target
        self.X, self.Y = utils.shuffle(self.X, self.Y)
        self.X = StandardScaler().fit_transform(self.X.astype(np.float32))
        if dataset_name == 'mnist':
            self.X_train = self.X[:50000]
            self.Y_train = self.Y[:50000]
            self.X_val = self.X[50000:]
            self.Y_val = self.Y[50000:]

    def batch_iterator(self, n_epochs, batch_size):
        for epoch in range(n_epochs):
            indices = np.arange(self.X_train.shape[0])
            np.random.shuffle(indices)
            for pos in range(0, self.X_train.shape[0] - batch_size + 1,
                             batch_size):
                ind = indices[pos:pos + batch_size]
                yield self.X_train[ind], self.Y_train[ind]
path = ''
for i in range(0, 15):
    training_nr = 30 * i
    csvfilename = path + str(training_nr) + ''
    outputfile = str(training_nr) + ''
    # Load training set
    test_set = loadTestSet(csvfilename)
    X, Y = test_set.loadTestSet()
    #X = MinMaxScaler().fit_transform(X)
    X = StandardScaler().fit_transform(X)
    X_df = pd.DataFrame(data=X.astype(float))
    Y_df = pd.DataFrame(data=Y.astype(int))
    result = pd.concat([X_df, Y_df], axis=1)
    result.to_csv(outputfile,
                  sep=';',
                  header=[
                      'Band_6_0406_Mean', 'Band_7_1104_Mean',
                      'Band_8_0406_Mean', 'Band_8A_0406_Mean',
                      'Band_11_0406_Mean', 'Band_12_0406_Mean', 'AfgKode'
                  ],
                  float_format='%.10f',
                  index=False)
print("Done")
def Load_Data(ipath):
    data = np.loadtxt(ipath, skiprows=1, dtype=str)
    # standardize the attribute columns in place, keeping the string matrix layout
    attri = data[:, 3:-1].astype(float)
    attri = StandardScaler().fit_transform(attri)
    data[:, 3:-1] = attri.astype(str)
    return data
def get_test_blobs(n_samples=1000, d=2):
    # d is currently unused; the fixed centers make the data 2-dimensional
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=n_samples,
                                centers=centers,
                                cluster_std=0.1,
                                random_state=0)
    X = StandardScaler().fit_transform(X)
    X = X.astype(np.float32)
    return X
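# A minimal usage sketch (assumed): cluster the test blobs with scikit-learn's
# DBSCAN, as in the neighboring DBSCAN examples in this collection.
from sklearn.cluster import DBSCAN

X_blobs = get_test_blobs(n_samples=1000)
blob_labels = DBSCAN(eps=0.3, min_samples=10).fit(X_blobs).labels_
print('clusters:', len(set(blob_labels)) - (1 if -1 in blob_labels else 0))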
class DIGITSClassifier(optimizee.Optimizee):
    name = 'digits_classifier'

    def __init__(self, num_units=20, num_layers=1, dataset_name='digits',
                 activation='sigmoid', return_func=False):
        super(DIGITSClassifier, self).__init__()
        self.dataset_name = dataset_name
        if dataset_name == 'digits':
            dataset = load_digits(n_class=10)
        elif dataset_name == 'mnist':
            dataset = fetch_mldata('MNIST original',
                                   data_home='/srv/hd1/data/vyanush/')
        elif dataset_name == 'random':
            num_features = np.random.randint(low=1, high=100)
            data_size = np.random.randint(low=100, high=1000)
            w = np.random.normal(size=num_features)
            w0 = np.random.normal(size=1, scale=0.1)
            X = np.random.normal(size=(data_size, num_features))
            Y = X.dot(w) + w0 > 0
            dataset = Dataset(X, Y)
        self.X, self.Y = dataset.data, dataset.target
        self.X, self.Y = utils.shuffle(self.X, self.Y)
        self.X = StandardScaler().fit_transform(self.X.astype(np.float32))
        self.num_units = num_units
        self.num_layers = num_layers
        self.activation = activation
        self.return_func = return_func
        self.x_len = 0
        self.x_len_counted = False

    def get_x_dim(self):
        return self.dim

    def build(self):
        with tf.variable_scope('digits_classifier'):
            self.dim = tf.placeholder(tf.int32, [], name='dim')
            # n_bptt_steps * batch_size * data_size * num_features
            self.x = tf.placeholder(tf.float32,
                                    [None, None, None, self.X.shape[1]],
                                    name='X')
            self.y = tf.placeholder(tf.int32, [None, None, None], name='y')

    def loss(self, x, i):
        self.coord_pos = 0
        self.coord_vector = x
        dims = [self.num_units] * self.num_layers
        # self.x[i].shape == (batch_size, data_size, n_inputs)
        #pred = tf.transpose(self.x[i], perm=[0, 2, 1])
        pred = self.x[i][0]
        activation = getattr(tf.nn, self.activation)
        with tf.variable_scope('nn_classifier/loss',
                               custom_getter=self.custom_getter) as scope, \
                tf.device('/gpu:0'):
            for n_outputs in dims:
                pred = tf.layers.dense(pred, n_outputs, activation=None)
                pred = tf.layers.batch_normalization(pred)
                pred = activation(pred)
            pred = tf.layers.dense(pred, 10)
            #pred = tf.transpose(pred, perm=[0, 2, 1])
            # shape = (batch_size, data_size, n_classes)
            f = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y[i][0], logits=pred), axis=-1)
            p = tf.argmax(tf.nn.softmax(pred), axis=-1)
            print(p.get_shape(), self.y[i][0].get_shape())
            acc = tf.reduce_mean(tf.cast(
                tf.equal(tf.cast(p, tf.int32), self.y[i][0]), tf.float32),
                axis=-1)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                g = self.grad(x, f)
        if not self.x_len_counted:
            self.x_len = self.coord_pos
            self.x_len_counted = True
        self.vars_ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=scope.name)
        if self.return_func:
            return f, g
        else:
            return acc, g

    def get_initial_x(self, batch_size=1):
        if self.dataset_name == 'mnist':
            self.batch_size = np.random.randint(low=1, high=256)
        else:
            self.batch_size = np.random.randint(low=1,
                                                high=self.X.shape[0] // 4 + 1)
        self.s = 0
        print("{} classifier; batch_size: {}".format(self.dataset_name,
                                                     self.batch_size))
        #w = np.random.normal(0, 0.01, size=(batch_size, self.x_len))
        w = np.zeros(self.x_len)
        print("x_len: ", w.shape)
        for name, d in self.coord_vars.items():
            start, end = d['pos']
            # initialize each coordinate block from its variable's initializer
            dummy = (d['initializer']
                     or glorot_uniform_initializer(dtype=tf.float32))(d['shape'])
            val = tf.get_default_session().run(dummy)
            w[start:end] = val.reshape(-1)
        return w[None, :]

    def get_new_params(self, batch_size=1):
        return {self.dim: self.x_len}

    def get_next_dict(self, n_bptt_steps, batch_size=1):
        x = np.zeros((n_bptt_steps, 1, self.batch_size, self.X.shape[1]))
        y = np.zeros((n_bptt_steps, 1, self.batch_size))
        for i in range(n_bptt_steps):
            if self.s + self.batch_size > self.X.shape[0]:
                self.s = 0
            pos_cur, pos_next = self.s, self.s + self.batch_size
            pos_cur = np.random.randint(low=0,
                                        high=self.X.shape[0] - self.batch_size)
            pos_next = pos_cur + self.batch_size
            x[i] = np.tile(self.X[None, pos_cur:pos_next], (batch_size, 1, 1))
            y[i] = np.tile(self.Y[None, pos_cur:pos_next], (batch_size, 1, 1))
            self.s = pos_next
        return {
            self.x: x,
            self.y: y,
        }
        # (fragment: tail end of a gensim Word2Vec(...) call from the preceding lines)
        min_count=0, sg=1, iter=1,
        workers=multiprocessing.cpu_count())
    wv = w2v.wv
    A = [wv[str(i)] for i in range(num_list[-1])]
    np.save("../%s_wv_%d_%s.npy" % (args.data, args.dimensions, args.walk), A)
    from sklearn.preprocessing import StandardScaler
    A = StandardScaler().fit_transform(A)
    # prepend a zero row so index 0 can serve as the padding embedding
    A = np.concatenate((np.zeros((1, A.shape[-1]), dtype='float32'), A),
                       axis=0)
    A = A.astype('float32')
    A = torch.tensor(A).to(device)
    print(A.shape)
    node_embedding = Wrap_Embedding(int(num_list[-1] + 1),
                                    args.dimensions,
                                    scale_grad_by_freq=False,
                                    padding_idx=0,
                                    sparse=False)
    node_embedding.weight = nn.Parameter(A)
elif args.feature == 'adj':
    flag = False
    node_embedding = MultipleEmbedding(embeddings_initial, bottle_neck, flag,
                                       num_list).to(device)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# READ DATA
df = pd.read_csv("teleCust1000t.csv")

# SPLIT INTO LABELS AND FEATURES
X = df[[
    'region', 'tenure', 'age', 'marital', 'address', 'income', 'ed',
    'employ', 'retire', 'gender', 'reside'
]].values
Y = df[['custcat']].values

# FEATURE NORMALISATION
X = StandardScaler().fit(X).transform(X.astype(float))
Y = Y.reshape(1000)

# DATA SPLITTING
train_x, test_x, train_y, test_y = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=4)

# CREATING MODEL
model = KNeighborsClassifier(n_neighbors=5)
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print("Accuracy of a model is ", accuracy_score(test_y, prediction))
    #out = torch.clamp(out, 1.0, 5.0)
    return out


df = pd.read_csv(inputFileName, sep=",")
df_percent = df.sample(frac=1).reset_index(drop=True).sample(frac=subsetFrac)
train = df_percent.sample(frac=1.0 - testFrac)
test = df_percent.drop(train.index)

train_labels = torch.tensor(
    np.expand_dims(train['Stars'].values.astype(np.float32), axis=1))
train_temp = train.drop('Stars', axis=1) if dropLastN == 0 else train.drop(
    'Stars', axis=1).iloc[:, :-dropLastN]
train_norm = StandardScaler().fit_transform(train_temp)
train_features = torch.tensor(train_norm.astype(np.float32))
train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(
    train_features, train_labels),
                                           batch_size=batch_size,
                                           shuffle=False,
                                           pin_memory=True)

test_labels = torch.tensor(
    np.expand_dims(test['Stars'].values.astype(np.float32), axis=1))
test_temp = test.drop('Stars', axis=1) if dropLastN == 0 else test.drop(
    'Stars', axis=1).iloc[:, :-dropLastN]
test_norm = StandardScaler().fit_transform(test_temp)
test_features = torch.tensor(test_norm.astype(np.float32))
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(
    test_features, test_labels),
                                          batch_size=batch_size,
                                          shuffle=False,
                                          pin_memory=True)
######
for dataset in ['biodeg.csv_header', 'voice.csv']:
    print("Working on", dataset, "data set...")
    data_df = pd.read_csv(dataset)
    if dataset == "biodeg.csv_header":
        dataX = data_df.iloc[:, :41]
        dataY = data_df.iloc[:, 41]
        dataset = "QSAR"
        comps = np.int32(np.linspace(2, 40, 20))
    else:
        dataX = data_df.iloc[:, :20]
        dataY = data_df.iloc[:, 20]
        dataset = "VOICE"
        comps = np.int32(np.linspace(2, 20, 20))
    dataX = StandardScaler().fit_transform(dataX.astype('float64'))

    #######################################################
    split = train_test_split(dataX, dataY, test_size=0.3, random_state=42)
    (trainData, testData, trainTarget, testTarget) = split

    # baseline accuracy on the untransformed features
    model = LinearSVC()
    model.fit(trainData, trainTarget)
    baseline = metrics.accuracy_score(model.predict(testData), testTarget)

    print("Running RP...")
    accuracies = []
    for comp in comps:
        # create the random projection
        #sp = SparseRandomProjection(n_components = comp)
train = train.drop(['date'], axis=1)
test = test.drop(['date', 'S1'], axis=1)
y = train['S1'].astype(np.float32)
train = train.drop(['S1'], axis=1)

######## check whether the data is linear ########
#plt.scatter(train['S7'], train['S1'])
#plt.xlabel('S7')
#plt.ylabel('S1')
#plt.show()

######## PRE-PROCESSING ########
# fit the scaler on the training data and reuse it for the test data, so the
# test set is not scaled with its own (leaked) statistics
scaler = StandardScaler().fit(train)
train = scaler.transform(train).astype(np.float32)
test = scaler.transform(test).astype(np.float32)

######## RF MODEL ########
rf = RandomForestRegressor(n_estimators=150, max_depth=15)
rf.fit(train, y)

######## FEATURE IMPORTANCE ########
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
def patch_probability_generator(PARAMS, fl, Train_Params):
    startTime = time.process_time()
    labels_sp = []
    labels_mu = []
    pred_opDir = PARAMS['opDir'] + '/__Frame_Predictions_CNN/'
    if not os.path.exists(pred_opDir):
        os.makedirs(pred_opDir)
    result_fName = fl + '_fold' + str(PARAMS['fold']) + '_result'
    n_fft = PARAMS['n_fft'][PARAMS['Model']]
    n_mels = PARAMS['n_mels'][PARAMS['Model']]
    featName = PARAMS['featName'][PARAMS['Model']]

    if not os.path.exists(pred_opDir + result_fName + '.pkl'):
        fName_path = PARAMS['test_path'] + '/features/' + fl + '.npy'
        if not os.path.exists(fName_path):
            return {}
        fv = np.load(fName_path, allow_pickle=True)
        fv = get_featuregram(PARAMS, PARAMS['feature_opDir'], fl, fv, n_fft,
                             n_mels, featName, save_feat=True)
        if 'HarmPerc' not in featName:
            fv = StandardScaler(copy=False).fit_transform(fv.T).T
        else:
            # standardize the harmonic and percussive halves separately
            nDim = np.shape(fv)[0]
            fv_H = fv[:int(nDim / 2), :]
            fv_H = StandardScaler(copy=False).fit_transform(fv_H.T).T
            fv_P = fv[int(nDim / 2):, :]
            fv_P = StandardScaler(copy=False).fit_transform(fv_P.T).T
            fv = np.append(fv_H.astype(np.float32), fv_P.astype(np.float32),
                           axis=0)
        nFrames = np.shape(fv)[1]
        annotations_mu, annotations_sp, music_marker, speech_marker = get_annotations(
            PARAMS['test_path'], fl, nFrames, PARAMS['opDir'])

        pred = np.empty([])
        pred_lab = np.empty([])
        batch_size = 10000
        labels_mu = []
        labels_sp = []
        for batchStart in range(0, np.shape(fv)[1], batch_size):
            batchEnd = np.min([batchStart + batch_size, np.shape(fv)[1]])
            fv_temp = fv[:, batchStart:batchEnd]
            music_marker_temp = music_marker[batchStart:batchEnd]
            speech_marker_temp = speech_marker[batchStart:batchEnd]
            print('\tBatch: (', batchStart, batchEnd, ') ', np.shape(fv_temp),
                  ' mu=', np.sum(music_marker_temp),
                  ' sp=', np.sum(speech_marker_temp), end=' ', flush=True)
            fv_patches_temp = get_feature_patches(PARAMS, fv_temp,
                                                  PARAMS['W'],
                                                  PARAMS['W_shift_test'],
                                                  featName)
            # a patch is labeled positive when more than half its frames are positive
            labels_mu_patches = cextract_patches(
                np.array(music_marker_temp, ndmin=2),
                np.shape(np.array(music_marker_temp, ndmin=2)), PARAMS['W'],
                PARAMS['W_shift_test']).astype(int)
            labels_mu_temp = (
                (np.sum(np.squeeze(labels_mu_patches, axis=1), axis=1) /
                 np.shape(labels_mu_patches)[2]) > 0.5).astype(int)
            labels_sp_patches = cextract_patches(
                np.array(speech_marker_temp, ndmin=2),
                np.shape(np.array(speech_marker_temp, ndmin=2)), PARAMS['W'],
                PARAMS['W_shift_test']).astype(int)
            labels_sp_temp = (
                (np.sum(np.squeeze(labels_sp_patches, axis=1), axis=1) /
                 np.shape(labels_sp_patches)[2]) > 0.5).astype(int)
            if 'Lemaire_et_al' in PARAMS['Model']:
                # TCN input shape=(batch_size, timesteps, ndim)
                fv_patches_temp = np.transpose(fv_patches_temp, axes=(0, 2, 1))
            if PARAMS['signal_type'] == 'music':
                pred_temp = Train_Params['model'].predict(x=fv_patches_temp)
                CM, acc, P, R, F1 = getPerformance(
                    np.array((pred_temp > 0.5).astype(int)), labels_mu_temp)
            elif PARAMS['signal_type'] == 'speech':
                pred_temp = Train_Params['model'].predict(x=fv_patches_temp)
                CM, acc, P, R, F1 = getPerformance(
                    np.array((pred_temp > 0.5).astype(int)), labels_sp_temp)
            pred_lab_temp = np.array(pred_temp > 0.5).astype(int)
            if np.size(pred) <= 1:
                pred = pred_temp
                pred_lab = pred_lab_temp
            else:
                pred = np.append(pred, pred_temp)
                pred_lab = np.append(pred_lab, pred_lab_temp)
            labels_mu.extend(labels_mu_temp)
            labels_sp.extend(labels_sp_temp)
            print(np.shape(fv_patches_temp), np.shape(pred_temp),
                  np.shape(pred), ' acc=', acc, F1)

        if PARAMS['signal_type'] == 'music':
            ConfMat, precision, recall, fscore = misc.getPerformance(
                pred_lab, labels_mu, labels=[0, 1])
            acc = np.round(np.sum(np.diag(ConfMat)) / np.sum(ConfMat), 4)
            print('Perf mu: ', acc, precision, recall, fscore)
        elif PARAMS['signal_type'] == 'speech':
            ConfMat, precision, recall, fscore = misc.getPerformance(
                pred_lab, labels_sp, labels=[0, 1])
            acc = np.round(np.sum(np.diag(ConfMat)) / np.sum(ConfMat), 4)
            print('Perf sp: ', acc, precision, recall, fscore)
        print('\n\n\n')

        probability_genTime = time.process_time() - startTime
        result = {
            'pred': pred,
            'pred_lab': pred_lab,
            'labels_sp': labels_sp,
            'labels_mu': labels_mu,
            'probability_genTime': probability_genTime,
            'ConfMat': ConfMat,
            'precision': precision,
            'recall': recall,
            'fscore': fscore,
            'accuracy': acc,
        }
        misc.save_obj(result, pred_opDir, result_fName)
        print('Test predictions saved!!!')
    else:
        result = misc.load_obj(pred_opDir, result_fName)
    return result
#
# Generate sample data for the DBSCAN test
#
# Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#example-cluster-plot-dbscan-py
#
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
# sklearn.datasets.samples_generator was removed; import from sklearn.datasets
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
X = X.astype(np.float64)

db = DBSCAN(eps=0.3, min_samples=10, metric='l2', algorithm='brute').fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# dump the points and their cluster labels for inspection
with open('dbscan.csv', 'w') as fscanout:
    with open('dbscan_labels.csv', 'w') as fscanlabout:
        for i in range(750):
            fscanout.write(",".join([str(x) for x in X[i, :]]) + "\n")
            fscanlabout.write(str(labels[i]) + "\n")
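# A minimal follow-up sketch (assumed): the snippet imports sklearn.metrics but
# never uses it; since ground-truth labels are available, the clustering can be
# scored as in the scikit-learn DBSCAN example it was lifted from.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels_true, labels))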
path = '/Users/zhangweijian01/Downloads/data.csv'
ori_data = pd.read_csv(path, header=0, sep='\t')
y_data = ori_data['Y']
x_data = ori_data.iloc[:, 3:]  # .ix is removed in newer pandas; use .iloc
x_data = x_data.fillna(x_data.mean())
y_data = y_data.fillna(y_data.mean())

# handle remaining missing values with a median imputer
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(x_data)
data_imp = imp.transform(x_data)

# scale and round to 4 decimal places
x_scaler = StandardScaler().fit_transform(data_imp)
x_scaler = x_scaler.astype(np.float64, copy=False)
x_scaler = np.round(x_scaler, 4)

# save preprocessed data to file
np.savetxt("newdata2.csv", x_scaler, delimiter=",")
with open('newdata1.csv', 'w') as f:
    for i in range(0, len(x_scaler)):
        line = str(y_data[i])
        for j in range(0, len(x_scaler[i])):
            line = line + ',' + str(x_scaler[i][j])
        line += '\n'
        f.write(line)