def fit_dbscan(data, eps, min_samples, normalize=True,
               show=True, juxta_cluster_indices_grouped=None, threshold_legend=None):
    X = np.transpose(data)

    if normalize:
        from sklearn.preprocessing import minmax_scale
        minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False)

    from sklearn.cluster import DBSCAN
    from sklearn import metrics
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    score = metrics.silhouette_score(X, labels, sample_size=5000)
    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)

    return db, n_clusters_, labels, core_samples_mask, score
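A minimal usage sketch (not part of the original source), assuming `data` arrives as a 2-D array of shape (n_features, n_samples) as the transpose above implies; `show=False` skips the `pf.show_clustered_tsne` plotting helper, and all names below are illustrative.

import numpy as np

# Hypothetical input: 3 features x 600 samples drawn from two well-separated blobs.
rng = np.random.default_rng(0)
demo_data = np.hstack([rng.normal(0.0, 0.05, size=(3, 300)),
                       rng.normal(1.0, 0.05, size=(3, 300))])

db, n_clusters, labels, core_mask, score = fit_dbscan(
    demo_data, eps=0.2, min_samples=10, show=False)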
Example #2
def combine(a, b, w):
    matches = {}

    # split dictionaries into keys and values
    al = [x for x in a.items()]
    ak, av = zip(*al)
    bl = [x for x in b.items()]
    bk, bv = zip(*bl)

    # scale the values in the range 0-1
    a_scaled = preprocessing.minmax_scale(av, feature_range=(0,1))
    b_scaled = preprocessing.minmax_scale(bv, feature_range=(0,1))

    # build numpy structured arrays combining scaled values and original keys
    names = ['keys', 'values']
    formats = ['S225', 'f8']
    dtype = dict(names=names, formats=formats)
    anp = np.array(list(zip(ak,a_scaled)), dtype=dtype)
    bnp = np.array(list(zip(bk,b_scaled)), dtype=dtype)

    # iterate over numpy structures creating a weighted average between values with the same key
    for i, t1 in np.ndenumerate(anp):
        for j, t2 in np.ndenumerate(bnp):
            if anp['keys'][i] == bnp['keys'][j]:
                stack = np.vstack((anp['values'][i], bnp['values'][j]))
                matches[anp['keys'][i].decode("utf-8")] = np.average(stack, axis=0, weights=w)[0]   # python dictionary

    return matches
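A small usage sketch (illustrative values, assuming `numpy as np` and `sklearn.preprocessing` are imported at module level as the snippet implies): the two dictionaries are min-max scaled independently and then blended per key with the given weights.

# Hypothetical inputs: two score dictionaries blended 70/30.
a = {'alpha': 10.0, 'beta': 5.0, 'gamma': 1.0}
b = {'alpha': 0.2, 'beta': 0.9, 'gamma': 0.4}
print(combine(a, b, w=[0.7, 0.3]))  # weighted average of the scaled values per key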
Example #3
def train_model_with_feature(config_name, clf_name, fill_na_opt, PCA_n_comp, clf, X, X_test, y):
    if PCA_n_comp!=-1:
        pca = PCA(PCA_n_comp) #PCA dimension reduction
        logger.info('PCA fit on count matrix')
        # rescale num to (0,1)
        X_all = pca.fit_transform( minmax_scale(np.vstack([X, X_test])) )
        X, X_test = X_all[:X.shape[0], :], X_all[X.shape[0]:, :]
        logger.info('PCA fit done')

    logger.info('start training')
    print('training size', X.shape, 'test size', X_test.shape)
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9)
    if clf_name=='xgb':
        clf.fit(X_train,y_train,eval_metric='mlogloss')
    else:
        clf.fit(X_train,y_train)
    logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') train log-loss='\
            +str(log_loss(y_train, clf.predict_proba(X_train))))
    logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') validate log-loss='\
            +str(log_loss(y_val, clf.predict_proba(X_val))))

    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    df_test[group_list] = y_pred
    logger.info('finish training')
    # , 'phone_brand_en', 'device_model_en'
    df_test.to_csv('output/'+config_name+'-'+clf_name+'-'+fill_na_opt+'-pca'+\
            str(PCA_n_comp)+'-'+str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))\
            +'.csv', columns=['device_id']+group_list, index=False)
    logger.info('finish outputting result')
Example #4
 def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type , _label):
     """ SKLearn을 사용해서 Pandas를 Proprocessing
         label은 Preprocessing 하면 안됨
     Args:
       params:
         * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
         * _df_csv_read_ori : pandas dataframe
         * _label
     Returns:
       Preprocessed DataFrame
     """
     if _preprocessing_type == None or _preprocessing_type == 'null':
         logging.info("No Preprocessing")
         result_df =  _df_csv_read_ori
     else :
         logging.info("Preprocessing type : {0}".format(_preprocessing_type))
         numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
         for i, v in _df_csv_read_ori.dtypes.iteritems():
             if v in numerics:
                 if i not in _label:
                     #preprocessing_types = ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
                     #_preprocessing_type = ['maxabs_scale']
                     if 'scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'minmax_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.minmax_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'robust_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.robust_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'normalize' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.normalize(_df_csv_read_ori[i].fillna(0.0))
                     if 'maxabs_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.maxabs_scale(_df_csv_read_ori[i].fillna(0.0))
         result_df = _df_csv_read_ori
     return result_df
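A standalone sketch of the same idea outside the class (hypothetical data and column names): every numeric, non-label column is scaled to [0, 1] while the label stays untouched.

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'age': [21.0, 35.0, 52.0, None],
                   'income': [10.0, 55.0, 32.0, 80.0],
                   'label': ['a', 'b', 'a', 'b']})
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
for col, dtype in df.dtypes.items():
    if str(dtype) in numerics and col != 'label':
        df[col] = preprocessing.minmax_scale(df[col].fillna(0.0))
print(df)  # 'age' and 'income' now lie in [0, 1]; 'label' is unchanged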
def resample(spectrum, resampled_header_broadcast, label_col=None, convolve=False, normalize=True):
    """Resamples the spectrum so that the x-axis starts at low and ends at high, while
    keeping the delta between the wavelengths"""
    resampled_header = resampled_header_broadcast.value
    if label_col is not None:
        logger.debug(spectrum.columns)
        without_label = spectrum.drop(label_col, axis=1)
        without_label.columns = pd.to_numeric(without_label.columns, errors="ignore")
    else:
        without_label = spectrum
    if convolve:
        to_interpolate = convolution.convolve(without_label.iloc[0].values, convolution.Gaussian1DKernel(7),
                                              boundary="extend")
    else:
        to_interpolate = without_label.iloc[0].values
    logger.debug(without_label)
    interpolated = np.interp(resampled_header, without_label.columns.values, to_interpolate)
    interpolated = interpolated[3:-3]  # remove some weird artefacts that might happen because of convo/interpolation
    if normalize:
        interpolated = prep.minmax_scale([interpolated], axis=1)
    logger.debug("Interpolated:%s", interpolated)
    interpolated_df = pd.DataFrame(data=interpolated, columns=resampled_header[3:-3], index=spectrum.index.values)
    if label_col is not None:
        interpolated_df[label_col] = spectrum[label_col]
    return interpolated_df
Example #6
def scale(x):
    m, n = x.shape
    x = np.reshape(x, (m*n, 1))
    x = minmax_scale(x, feature_range=(0, 1), axis=0)
    data = np.reshape(x, (m, n))

    return data
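Note that `scale()` above flattens the matrix before scaling, so a single global min/max is shared by all entries; a short comparison sketch (hypothetical values) against per-column scaling:

import numpy as np
from sklearn.preprocessing import minmax_scale

x = np.array([[0.0, 10.0, 100.0],
              [5.0, 20.0,  50.0]])
print(scale(x))                 # one min/max shared by the whole matrix
print(minmax_scale(x, axis=0))  # independent min/max per column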
def data_preprocess_musk_svm(bags):
    bag_labels = list()
    instances = list()
    for bag in bags:
        [n_instances, _] = bag['instances'].shape
        if 0 == bag['label']:
            bag['label'] = 0
            bag['inst_labels'] = np.zeros([n_instances, ])
            bag_labels.append(0)
        else:
            bag['label'] = 1
            bag['inst_labels'] = np.ones([n_instances, ])
            bag_labels.append(1)
        instances.extend(bag['instances'])
        # bag['instances'] /= 100
        # bag['instances'] = preprocessing.minmax_scale(bag['instances'], axis=1, feature_range=(-1, 1))
        # bag['instances'] = preprocessing.normalize(bag['instances'], axis=1)
    instances = np.asarray(instances)
    instances = preprocessing.minmax_scale(instances, axis=0, feature_range=(-1, 1))
    # instances = preprocessing.minmax_scale(instances, axis=0, feature_range=(0, 1))
    # instances = preprocessing.normalize(instances, norm='l2', axis=0)
    inst_idx = 0
    for bag in bags:
        [n_instances, _] = bag['instances'].shape
        bag['instances'] = instances[inst_idx: inst_idx + n_instances, :]
        inst_idx += n_instances
    return bags, bag_labels
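A toy usage sketch (hypothetical bags, not from the MUSK data) showing the expected input format: each bag is a dict with an 'instances' array and a scalar 'label'.

import numpy as np

toy_bags = [
    {'instances': np.array([[1.0, 200.0], [2.0, 150.0]]), 'label': 0},
    {'instances': np.array([[9.0, 900.0], [8.0, 820.0], [7.0, 760.0]]), 'label': 1},
]
scaled_bags, labels = data_preprocess_musk_svm(toy_bags)
print(labels)                       # [0, 1]
print(scaled_bags[0]['instances'])  # each feature rescaled to [-1, 1] across all instances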
    def get_outliers_by_ransac(self, table, column_indexes):
        '''
        Get outliers using RANSAC regression, which deals better with large outliers in the y direction,
        and is faster than Huber when the number of samples is very large.
        RANSAC outputs perfect precision (100%) but far from perfect recall (could be 50% - 60%) in our experiments.
        '''
        X = table[ :, column_indexes[ :-1]].astype(float)
        X = utils.enforce_columns(X)
        y = table[ :, column_indexes[-1]].astype(float)

        # preprocessing doesn't make any difference for RANSAC in our experiments
        #x = preprocessing.minmax_scale(x)
        #y = preprocessing.minmax_scale(y)

        model_ransac = RANSACRegressor(LinearRegression())
        model_ransac.fit(X, y)

        inlier_mask = model_ransac.inlier_mask_
        outlier_mask = np.logical_not(inlier_mask)
        outliers = [idx for idx, val in enumerate(outlier_mask) if val]

        residuals = abs(model_ransac.predict(X) - y)
        confidences = preprocessing.minmax_scale(residuals[outliers])*0.09+0.9

        return (outliers, confidences)
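A self-contained sketch of the same approach outside the class, with hypothetical data (the `utils.enforce_columns` helper is omitted here):

import numpy as np
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn import preprocessing

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(100, 1))
y = 3.0 * X.ravel() + rng.normal(0, 0.1, size=100)
y[[5, 42]] += 50.0  # inject two large outliers in the y direction

model_ransac = RANSACRegressor(LinearRegression())
model_ransac.fit(X, y)
outliers = np.where(~model_ransac.inlier_mask_)[0]

# map the outliers' residuals into a 0.90-0.99 confidence band, as above
residuals = np.abs(model_ransac.predict(X) - y)
confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9
print(outliers, confidences)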
Example #9
File: ml.py Project: chqiwang/box-office
def ensemble():
    with open('train_test_norm.pickle') as f:
        train_set,test_set = pickle.load(f)
    m,n = len(train_set),len(train_set[0]['search_idx'])
    X = np.zeros([m,n])
    Y = np.zeros([m,])
    for i in range(m):
        movie = train_set[i]
        X[i,:] = np.asarray(movie['search_idx'],float)
        Y[i] = float(movie['total_money'])
    
    Y /= np.max(Y)
    X = minmax_scale(X)
    W = calc_W(Y)
    
    result = []
    kf = cross_validation.KFold(n=m, n_folds=10, shuffle=True,random_state=None)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        W_train, W_test = W[train_index], W[test_index]
        
        boost = train_with_search_index(X_train,y_train,W_train)
        classifier,regressors = train_with_kmeans(X_train,y_train,W_train)
        
        result.append(score_en(boost,classifier,regressors,X_test,y_test))
Example #10
def parse_file(file_name):
    with open(file_name) as f:
        lines = f.readlines()
    with open('params.pickle') as f:
        params = pickle.load(f)
    
    m,n = len(lines),31+31+50+len(params['T'])+len(params['P'])+1+1+1
    X = np.zeros([m,n])
    names = []
    
    for i,line in enumerate(lines):
        units = line.split(',')
        units = [unit.decode('utf-8') for unit in units]
        X[i,0:31] = np.array([float(units[c]) for c in range(31)],float)
        X[i,31:62] = np.array([float(units[c]) for c in range(31,62)],float)
        img_path = units[62]
        img = plt.imread(img_path)
        img[:] = 0
        types = set(units[63].split(';'))
        for c,typ in enumerate(params['T']):
            if typ in types:
                X[i,112+c] = 1
        P = units[64]
        X[i,112+len(params['T'])+params['P'][P]] = 1
        X[i,112+len(params['T'])+len(params['P'])] = float(units[65])
        X[i,112+len(params['T'])+len(params['P'])+1] = float(units[67])
        X[i,112+len(params['T'])+len(params['P'])+2] = float(units[66]) - params['B']
        names.append(units[68].strip())
    
    X = minmax_scale(X[:,0:31])
    return X,names
Example #11
def calcImageHist(imagePath, nbins):
    img = io.imread(imagePath)

    imgComposed = np.apply_along_axis(convColors, 2, img)

    imgNorm = preprocessing.minmax_scale(imgComposed.astype(float))[0]

    return np.histogram(imgNorm, bins=nbins)
 def minMaxScale(df):
     """
     Min-max scale the data (normalize to [0, 1]).
         :param df: the input DataFrame
         :returns: the scaled data
     """
     if not isinstance(df, pd.DataFrame):
         raise Exception("df is not DataFrame!")
     return preprocessing.minmax_scale(df)
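A quick usage sketch (hypothetical frame), assuming the helper is called as a plain function since it takes the DataFrame directly rather than `self`:

import pandas as pd

df = pd.DataFrame({'height': [150, 170, 190], 'weight': [50, 80, 110]})
print(minMaxScale(df))  # each column independently mapped to [0, 1]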
def scale_bag(bags):
    bag_labels = list()
    for bag in bags:
        [n_instances, _] = bag['instances'].shape
        if 0 == bag['label']:
            bag['inst_labels'] = np.zeros([n_instances, ])
            bag_labels.append(0)
        else:
            bag['inst_labels'] = np.ones([n_instances, ])
            bag_labels.append(1)
        bag['instances'] = preprocessing.minmax_scale(bag['instances'], axis=1, feature_range=(0, 1))
    return bags, bag_labels
Example #14
 def rank_plt(self, ylim=(0.0, 1.1)):
     if not hasattr(self, 'rank'): self.rank_get()
     fig, axs = plt.subplots(figsize=(13, 5))
     axs.set_ylim(ylim)
     axs.grid(True)
     # movIdx
     X = np.array([self[uuid].logMI() for uuid in self.rank])
     axs.plot(minmax_scale(X), color='y', label='movIdx')
     # number of bSite reports
     X = np.array([np.sum(self[uuid].xLastR('site', self.xLast)) for uuid in self.rank], dtype='float')
     axs.plot(minmax_scale(X), color='c', label='bSites')
     # number of adult reports
     X = np.array([np.sum(self[uuid].xLastR('adult', self.xLast)) for uuid in self.rank], dtype='float')
     axs.plot(minmax_scale(X), color='m', label='adults')
     # score
     X = np.array([self[uuid].score for uuid in self.rank])
     axs.plot(X, color='b', label='uScore')
     # legend
     plt.legend(loc='upper left')
     # plt.title('+++ ranked scoring +++')
     plt.show()
    def sampling_pdf(self, n_sample=20):
        model = self.model
        values = self.values

        vd_samples = values[random.choice(len(values), n_sample)]
        pvalues = model.predict(vd_samples)
        pvalues = minmax_scale(pvalues, feature_range=(0, 1), axis=0, copy=False)

        header = list(self.df.columns) + ["proba"]
        body = np.concatenate([vd_samples, np.asarray([pvalues]).T], axis=1)
        sorted_body = body[body[:,-1].argsort()[::-1]]
        
        report_df = pd.DataFrame(data=sorted_body, columns=header)
        return report_df
def ConstructArrays(array):
    shape = np.shape(array)
    width = shape[1]
    onesArray = np.ones((len(array),1))
    splitArrays = np.split(array, [width - 1], 1)
    
    splitArrays[0] = prep.minmax_scale(splitArrays[0])
    featuresArray = np.hstack([onesArray, splitArrays[0]])
    outputArray = splitArrays[1]
    # outputArray = prep.minmax_scale(splitArrays[1])

    return featuresArray, outputArray
Example #17
File: segment.py Project: jupito/dwilib
def normalize(img):
    """Normalize image intensity."""
    assert img.ndim == 4
    logging.info('Preprocessing: %s', array_info(img))
    info = img.info
    original_shape = img.shape
    img = img.reshape((-1, img.shape[-1]))

    img = preprocessing.minmax_scale(img)
    # img = preprocessing.scale(img)
    # img = preprocessing.robust_scale(img)

    img = img.reshape(original_shape)
    img = dwi.image.Image(img, info=info)
    return img
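The reshape-scale-reshape pattern above also works with a plain NumPy array; a minimal sketch with a hypothetical 4-D volume, flattening the spatial axes and treating the last axis as the parameter axis:

import numpy as np
from sklearn import preprocessing

vol = np.random.default_rng(0).uniform(0, 4000, size=(3, 8, 8, 2))
flat = vol.reshape((-1, vol.shape[-1]))     # (n_voxels, n_parameters)
flat = preprocessing.minmax_scale(flat)     # each parameter scaled to [0, 1]
vol_scaled = flat.reshape(vol.shape)
print(vol_scaled.min(), vol_scaled.max())   # ~0.0 ... ~1.0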
Example #18
    def preprocess(self):
        """
        Normalization of data between 0 and 1 and subtraction of the nuggets
        Returns:
            pandas.core.frame.DataFrame: Dataframe containing the transformed data
            pandas.core.frame.DataFrame: Containing the subtracted nuggets

        """
        import sklearn.preprocessing as skp

        # Normalization
        scaled_data = pn.DataFrame(skp.minmax_scale(self.exp_var_raw[self.properties]), columns=self.properties)

        # Nuggets
        nuggets = scaled_data[self.properties].iloc[0]
        processed_data = scaled_data - nuggets
        return processed_data, nuggets
Example #19
def add_golden_features(X):
    with_added_features = X
    for i in range(len(X.columns)):
        stdout.write("\r%d out of %d" %(i,len(X.columns)))
        stdout.flush()
        for j in range(i+1, len(X.columns)):
            new_col_mult = preprocessing.scale(np.array(X.iloc[:, i] * X.iloc[:, j]))
            new_col_plus = preprocessing.scale(np.array(X.iloc[:, i] + X.iloc[:, j]))
            with_added_features = np.column_stack((with_added_features, new_col_mult, new_col_plus))
        new_col_log = np.log(preprocessing.minmax_scale(np.array(X.iloc[:, i]), feature_range=(0, 1)))
        new_col_rank = preprocessing.scale(np.array(rankdata(X.iloc[:, i])))
        with_added_features = np.column_stack((with_added_features, new_col_log, new_col_rank))
    reduced_38 = dim_reduce(X.iloc[:, :38])
    with_added_features = np.column_stack((with_added_features, reduced_38))
    stdout.write("\n")
    s0, s1 = with_added_features.shape
    columns = ['gf_'+str(j) for j in range(s1)]
    df2 = pd.DataFrame(with_added_features, columns=columns)
    return df2
def scale_inst(bags):
    bag_labels = list()
    instances = list()
    for bag in bags:
        [n_instances, _] = bag['instances'].shape
        if 0 == bag['label']:
            bag['inst_labels'] = np.zeros([n_instances, ])
            bag_labels.append(0)
        else:
            bag['inst_labels'] = np.ones([n_instances, ])
            bag_labels.append(1)
        instances.extend(bag['instances'])
    instances = np.asarray(instances)
    instances = preprocessing.minmax_scale(instances, axis=1, feature_range=(0, 1))
    inst_idx = 0
    for bag in bags:
        [n_instances, _] = bag['instances'].shape
        bag['instances'] = instances[inst_idx: inst_idx + n_instances, :]
        inst_idx += n_instances
    return bags, bag_labels
Example #21
def cell_fd_extention(fname_org='sheet.gz/cell_db.cvs.gz', camera_bit_resolution=14):
    cell_df = pd.read_csv(fname_org)
    Limg, Lx, Ly = cell_fd_info(cell_df)

    cell_df_ext = cell_df.copy()

    # Fresnel diffraction
    cell_img_fd_a = cell_fd_conv(cell_df)
    cell_df_ext['freznel image'] = cell_img_fd_a.reshape(-1)

    # max_v, min_v = np.max(cell_df["image"]), np.min(cell_df["image"])
    cell_img_fd_a_2d = cell_img_fd_a.reshape(Limg, -1)
    cell_img_fd_a_2d_scale = preprocessing.minmax_scale(
        np.abs(cell_img_fd_a_2d)) * (2**camera_bit_resolution)
    cell_img_fd_a_2d_scale_200x144x144 = cell_img_fd_a_2d_scale.reshape(
        Limg, Lx, Ly).astype(int)
    cell_df_ext[
        'mag freznel image'] = cell_img_fd_a_2d_scale_200x144x144.reshape(-1)

    return cell_df_ext
Example #22
def dataEncodeFor(approach, oriData):
    if approach == 'AE':
        encodedAEData = dataEncode(oriData)
        npen = encodedAEData.data.numpy()
        np.save('aeEncode.npy', npen)
        return npen
    if approach == 'PCA':
        return myPca.dataEncode(oriData, white=False)
    if approach == 'PCA2':
        return myPca.dataEncode2(oriData, white=False, stay=0.85)
    if approach == 'KPCA':
        kpcaEncode = myKpca.dataEncode('rbf', oriData, 90)
        return kpcaEncode
    if approach == 'KPCA2':
        kpcaEncode = myKpca.dataEncode('rbf', oriData, 90)
        return kpcaEncode
    if approach == 'to01':
        return preprocessing.minmax_scale(oriData)
    if approach == 'Normal':
        return preprocessing.scale(oriData)
Example #23
    def prepareDataSet(self, iris):
        self.int_num_classes = numpy.unique(iris.target).shape[0]

        self.int_set_size = iris.data.shape[0]
        self.int_num_features = iris.data.shape[1]

        ## normalize data
        self.data = preprocessing.normalize(iris.data)
        self.data = preprocessing.minmax_scale(self.data, (-1, 1))

        # load data in arrays
        for i in range(0, len(self.data)):
            Y = iris.target[i]
            X = numpy.append(self.data[i], 1)  ## bias input = 1
            check_i = i % self.int_num_per_class
            if check_i < self.int_training_size:
                self.training.append([X, Y])
            else:
                self.testing.append([X, Y])
        numpy.random.shuffle(self.training)
        numpy.random.shuffle(self.testing)
def normalise_data(train_data: np.ndarray, test_data: np.ndarray):
    from sklearn import preprocessing

    print("np.max(train_data): " + str(np.max(train_data)))
    print("np.ptp(train_data): " + str(np.ptp(train_data)))

    normalised_1 = 1 - (train_data - np.max(train_data)) / -np.ptp(train_data)
    normalised_2 = preprocessing.minmax_scale(train_data, axis=1)

    print(train_data[0])

    train_data /= 16
    test_data /= 16

    print("Are arrays equal: " + str(np.array_equal(normalised_2, train_data)))
    print("Are arrays equal: " + str(np.array_equal(normalised_1, train_data)))

    for i in range(0, 1):
        print(train_data[i])
        print(normalised_1)
        print(normalised_2)
Example #25
def dataEncodeFor(approach, oriData,stay=0.95):
    if approach == 'AE':
        encodedAEData = dataEncode(oriData, stay if stay % 1 == 0 else stay * 331)
        npen = encodedAEData.data.numpy()
        # np.save('aeEncode.npy', npen)
        return npen
        # np.load('aeEncode.npy')
        # return npen
    if approach == 'PCA':
        return myPca.dataEncode(oriData, white=False, stay=stay)
    if approach == 'PCA2':
        return myPca.dataEncode2(oriData, white=False, stay=0.85)
    if approach == 'KPCA':
        kpcaEncode = myKpca.dataEncode('rbf', oriData, stay if stay % 1 == 0 else stay * 331)
        return kpcaEncode
    if approach == 'KPCA2':
        kpcaEncode = myKpca.dataEncode('rbf', oriData, stay if stay % 1 == 0 else stay * 331)
        return kpcaEncode
    if approach == 'to01':
        return preprocessing.minmax_scale(oriData)
    if approach == 'Normal':
        return preprocessing.scale(oriData)
Example #26
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(
        file_in)['close'])
    data_m = np.array([[data[i + x * 24 * 24] for x in range(6)]
                       for i in range(len(data) - 6 * 24 * 24 + 1)])
    data_m = data_m.reshape(-1, 6)
    data_s = np.array([data[i + 6 * 24 * 24]
                       for i in range(len(data) - 6 * 24 * 24)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
Example #27
def dataEncode(data, stay, EPOCH = 500, BATCH_SIZE = 32, LR = 0.0001 ):
    oriData = data
    scData = preprocessing.minmax_scale(oriData)
    scData = torch.from_numpy(scData).type(torch.FloatTensor)
    oriSc = Variable(scData)
    train_loader = Data.DataLoader(dataset=scData, batch_size=BATCH_SIZE, shuffle=True)
    ae = AutoEncoder(stay)
    optimizer = torch.optim.Adam(ae.parameters(), lr=LR)
    loss_func = nn.MSELoss()

    for epoch in range(EPOCH):
        for step, x in enumerate(train_loader):
            tx = Variable(x)
            ty = Variable(x)
            encoded, decoded = ae(tx)
            loss = loss_func(decoded, ty)  # mean square error
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()  # backpropagation, compute gradients
            optimizer.step()  # apply gradients
            if step % 1 == 0:
                print('Epoch: ', epoch, '| train loss: %.6f' % loss.item())
                # print(len(loss))
    encoded_data, _ = ae(oriSc)
    return encoded_data
    def get_outliers_by_huber(self, table, column_indexes):
        '''
        Get outliers using huber regression, which outperforms RANSAC, 
        but doesn't scale well when the number of samples is very large.
        Huber outputs both perfect precision (100%) and recall (100%) in our experiments.
        '''
        X = table[ :, column_indexes[ :-1]].astype(float)
        X = utils.enforce_columns(X)
        y = table[ :, column_indexes[-1]].astype(float)

        # preprocessing could make HUBER fail on some dataset in our experiments 
        #x = preprocessing.minmax_scale(x)
        #y = preprocessing.minmax_scale(y)

        model_huber = HuberRegressor()
        model_huber.fit(X, y)

        outlier_mask = model_huber.outliers_
        outliers = [idx for idx, val in enumerate(outlier_mask) if val]

        residuals = abs(model_huber.predict(X) - y)
        confidences = preprocessing.minmax_scale(residuals[outliers])*0.09+0.9

        return (outliers, confidences)
Example #29
def build_images():
    """Completely load, resize, and save the images for training. Main function."""
    # get image file paths for each image type
    path_stub = r'D:\Users\James\Dropbox\Programming\Python\Projects\pylinac test files'
    pf_files = get_image_files(osp.join(path_stub, 'Picket Fences'))
    pipspro_files = get_image_files(osp.join(path_stub, '2D Image quality phantoms', 'PipsPro'))
    leeds_files = get_image_files(osp.join(path_stub, '2D Image quality phantoms', 'Leeds'))
    wl_files = get_image_files(osp.join(path_stub, 'Winston-Lutz'))
    # cbct_files = get_image_files(osp.join(path_stub, 'CBCTs'))
    filepaths = pf_files + pipspro_files + leeds_files + wl_files
    print("{} files found".format(len(filepaths)))

    # preallocate
    total_array = np.zeros((len(filepaths), 10000), dtype=np.float32)
    print("Training array preallocated")

    # resize each image and add to a training array
    start = time.time()
    futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as exec:
        for idx, path in enumerate(filepaths):
            future = exec.submit(process_image, path)
            futures[future] = idx
    for idx, future in enumerate(concurrent.futures.as_completed(futures)):
        total_array[futures[future], :] = future.result()
    print("Training array set in {:.2f}s".format(time.time() - start))

    # feature scale the images
    scaled_array = preprocessing.minmax_scale(total_array, feature_range=(0, 1), axis=1)
    print("Training array scaled")

    # save arrays to disk for future use
    np.save(osp.join(osp.dirname(osp.abspath(__file__)), 'images'), scaled_array)
    np.save(osp.join(osp.dirname(osp.abspath(__file__)), 'labels'), np.concatenate(
        (np.repeat(0, len(pf_files)), np.repeat(1, len(pipspro_files)), np.repeat(2, len(leeds_files)), np.repeat(3, len(wl_files)))))
    print("Images build")
Example #30
def scale_data_minmax(X):
    #Xt = preprocessing.scale(X)
    Xt = preprocessing.minmax_scale(X)

    return Xt
Example #31
# 2. Preparing More Features
from sklearn.preprocessing import minmax_scale
# The holdout set has a missing value in the Fare column which
# we'll fill with the mean.
holdout["Fare"] = holdout["Fare"].fillna(train["Fare"].mean())
columns = ['SibSp', 'Parch', 'Fare']

train['Embarked'] = train['Embarked'].fillna('S')
train = create_dummies(train, "Embarked")

holdout['Embarked'] = holdout['Embarked'].fillna('S')
holdout = create_dummies(holdout, "Embarked")

for col in columns:
    train[col + "_scaled"] = minmax_scale(train[col])
    holdout[col + "_scaled"] = minmax_scale(holdout[col])

print(train)

# 3. Determining the Most Relevant Features
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd

columns = ['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp_scaled', 'Parch_scaled', 'Fare_scaled']
def rating_pred_binary(rating_pred):
    # Normalize the predictions
    rating_pred[DEFAULT_PREDICTION_COL] = minmax_scale(
        rating_pred[DEFAULT_PREDICTION_COL].astype(float))
    return rating_pred
def save_left_out_matrix(alpha,
                         tax_ids,
                         left_out_tax_id,
                         blast_folder='./blast_files/',
                         network_folder='./network_files/',
                         block_matrix_folder='./block_matrix_files',
                         version=1):
    '''
    Assumes all necessary block matrices have already been computed and that the network files
    (including the left-out one, needed for protein ids only) have been downloaded from STRING.
    Computes S^{T}S (the bipartite graph projection) for every IsoRank matrix related to the left-out
    matrix and averages them to get the predicted network. This network is saved, and IsoRank is then
    computed between the predicted network and the BLAST matrix of the species against itself.
    '''
    print('Save left out matrix!')
    print('Tax ids:')
    print(tax_ids)
    tax_id_combos = []
    used_tax_ids = [tax_id for tax_id in tax_ids if tax_id != left_out_tax_id]
    for ii in range(0, len(tax_ids)):
        tax_id_combos.append((tax_ids[ii], left_out_tax_id + '-leaveout'))
    print(tax_id_combos)
    pool = Pool(int(multiprocessing.cpu_count()))
    #isorank_blocks = pool.starmap(load_single_isorank_block, zip(tax_id_combos, itertools.repeat(alpha), itertools.repeat(block_matrix_folder)))
    network_file = network_folder + left_out_tax_id + "_networks_string.v11.0.pckl"
    leftout_prot2index, A, left_out_net_prots = load_adj(network_file)
    if version == 1:  # S transpose S
        print('VERSION 1 (S^{T}S)')
        isorank_blocks = [
            load_single_isorank_block(*args)
            for args in zip(tax_id_combos, itertools.repeat(alpha),
                            itertools.repeat(block_matrix_folder))
        ]
        replacements = [
            get_s_transpose_s(isorank_block.todense())
            for isorank_block in isorank_blocks
        ]
    elif version == 2:  # S matrix network projection
        print(
            'VERSION 2 (S^{T}AS) S MATRIX NETWORK PROJECTION WITH NONLEFTOUT ORGANISM\'S NETWORK'
        )
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            network_file = network_folder + nonleftout_taxon + "_networks_string.v11.0.pckl"
            _, nonleftout_net, _ = load_adj(network_file)
            isorank_block = load_single_isorank_block(tax_id_combo, alpha,
                                                      block_matrix_folder)
            replacements.append(
                get_s_transpose_A_s(isorank_block.todense(),
                                    nonleftout_net.todense()))
    elif version == 3:  # blast only baseline
        print('VERSION 3 (R^{T}R) BLAST ONLY')
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            network_file = network_folder + nonleftout_taxon + "_networks_string.v11.0.pckl"
            prot2index_1, _, _ = load_adj(network_file)
            R = load_blast_from_taxa(nonleftout_taxon, left_out_tax_id,
                                     prot2index_1, leftout_prot2index,
                                     blast_folder)
            replacements.append(get_s_transpose_s(R.todense()))
    elif version == 4:  # blast network projection
        print('VERSION 4 (R^{T}AR) BLAST NETWORK PROJECTION')
        replacements = []
        for tax_id_combo in tax_id_combos:
            nonleftout_taxon = tax_id_combo[0]
            network_file = network_folder + nonleftout_taxon + "_networks_string.v11.0.pckl"
            prot2index_1, nonleftout_net, _ = load_adj(network_file)
            nonleftout_net = nonleftout_net.todense()
            R = load_blast_from_taxa(nonleftout_taxon, left_out_tax_id,
                                     prot2index_1, leftout_prot2index,
                                     blast_folder).todense()
            replacements.append(get_s_transpose_A_s(R, nonleftout_net))
    else:
        raise NotImplementedError(
            'Version for making left out network matrix must be either 1, 2, 3, 4.'
        )
    replacements = np.array(replacements)
    print(replacements.shape)

    #replacements = pool.starmap(get_ss_transpose, zip(isorank_blocks))
    left_out_matrix = np.mean(replacements, axis=0)
    print(left_out_matrix.shape)
    density = np.count_nonzero(left_out_matrix) / (left_out_matrix.shape[0] *
                                                   left_out_matrix.shape[1])
    print(left_out_matrix)
    print('Density of left out matrix: ' + str(density))
    left_out_matrix = minmax_scale(left_out_matrix)
    print(left_out_matrix)
    density = np.count_nonzero(left_out_matrix) / (left_out_matrix.shape[0] *
                                                   left_out_matrix.shape[1])
    print('Density of left out matrix after minmax scaling: ' + str(density))
    left_out_fname = network_folder + left_out_tax_id + "_leftout_network_using_" + ','.join(
        used_tax_ids) + '_version_' + str(version) + "_string.v11.0.pckl"
    left_out_feats = {}
    left_out_feats['net'] = sparse.csr_matrix(left_out_matrix)
    left_out_feats['prot_IDs'] = left_out_net_prots
    print(left_out_feats.keys())
    print('Dumping ' + left_out_fname)
    pickle.dump(left_out_feats, open(left_out_fname, 'wb'), protocol=4)
    print(
        'Making IsoRank block of leaveout species with intraspecies blast connections'
    )
    save_single_isorank_block((left_out_tax_id, left_out_tax_id),
                              alpha,
                              network_folder,
                              blast_folder,
                              block_matrix_folder,
                              False,
                              True,
                              used_tax_ids=used_tax_ids,
                              version=version)
    image1 = crop_image_from_gray(image1)

    image1 = cv2.resize(image1, (512, 512), Image.ANTIALIAS)
    return image1


#files = next(os.walk('A:/HealthAnalytics/image_Processing/Project/images'))[2]
path = '.\Project'
files = next(os.walk(path + '/Train_data'))[2]
for filename in files:
    #path='A:/HealthAnalytics/image_Processing/Project/images/'+filename
    img_path = path + '/Train_data/' + filename  # per-file path; avoid accumulating into `path`
    image = cv2.imread(img_path)
    color_processed = preprocess_image(image)
    shape = color_processed.shape
    image_scaled = sk.minmax_scale(color_processed.ravel(),
                                   feature_range=(0, 1)).reshape(shape)
    #plt.imshow(color_processed)

    #plt.hist(color_processed.ravel(), bins=256, range=(0.0, 1.0), fc='k', ec='k') #calculating histogram
    #plt.hist(entr_img.ravel(), bins=256, range=(0.0, 1.0), fc='k', ec='k') #calculating histogram

    rgbimage = rgb2gray(image_scaled)

    entr_img = entropy(rgbimage, disk(10))

    entr_img_resize = cv2.resize(entr_img, (256, 256), Image.ANTIALIAS)
    #plt.imshow(entr_img_resize)
    image_entropy = toimage(entr_img_resize)
    #plt.imshow(image_entropy)
    image_entropy.save(path + '/' + filename)
print(train.columns)

## 2. Preparing More Features ##

from sklearn.preprocessing import minmax_scale
# The holdout set has a missing value in the Fare column which
# we'll fill with the mean.
holdout["Fare"] = holdout["Fare"].fillna(train["Fare"].mean())
columns = ['SibSp', 'Parch', 'Fare']

train["Embarked"] = train['Embarked'].fillna('S')
holdout["Embarked"] = holdout['Embarked'].fillna('S')
train = create_dummies(train, 'Embarked')
holdout = create_dummies(holdout, 'Embarked')
for col in columns:
    train[col + '_scaled'] = minmax_scale(train[col])
    holdout[col + '_scaled'] = minmax_scale(holdout[col])

## 3. Determining the Most Relevant Features ##

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

columns = [
    'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Child',
    'Age_categories_Teenager', 'Age_categories_Young Adult',
    'Age_categories_Adult', 'Age_categories_Senior', 'Pclass_1', 'Pclass_2',
    'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
    'Embarked_S', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled'
]
Example #36
'''
Created on May 23, 2016

@author: ahanagrawal
'''

import numpy as np
from sklearn import preprocessing as prep

if __name__ == '__main__':
    a = np.random.rand(5,5)
    print(a)
    print(prep.minmax_scale(a))
    

    
    # e.g. (0.56391324 - 0.0560) / (0.9158 - 0.0560)
Example #37
def save(obj, files):
    import cPickle
    with open(files, 'wb') as f:
        cPickle.dump(obj, f)


if __name__ == '__main__':
    paras = []
    resultFile = []
    for sparess in range(5, 6, 5):
        for fileNum in range(1, 2):
            trainFile = "/root/AAA/dataset/qos/tp/train/sparseness%s/training%d.txt" % (
                sparess, fileNum)
            side = 'user'
            ws = False
            for eps in [0.01, 0.1, 1.5, 1, 2, 5, 10, 15, 20, 40, 60, 80]:
                for min_samples in [1, 2, 4, 6, 8, 10]:
                    saveFile = "/root/AAA/dataset/qos/tp/PoolCoOccurrenceMatrix-%s-%d-%d-%d-%d" % (
                        side, sparess, fileNum, eps, min_samples)
                    resultFile.append(saveFile)
                    paras.append(([trainFile, eps, min_samples, ws], None))

    threadNum = 20
    pool = threadpool.ThreadPool(threadNum)
    requests = threadpool.makeRequests(createCoOccurrenceMatrixByDBSCAN, paras)
    results = [pool.putRequest(req) for req in requests]
    pool.wait()
    for index, result in enumerate(results):
        result = minmax_scale(result, axis=1)
        np.savetxt(resultFile[index], result, delimiter='\t', fmt='%f')
Example #38
File: linear.py Project: feifeifei12/123
def minmaxscale(series):  # scale to [0, 1]
    series_scale = preprocessing.minmax_scale(series, feature_range=(0, 1))
    return series_scale
Example #39
        f_path = dir_path + f
        # if os.path.isdir(f_path):
        #     continue
        # if f != 'ECG200':
        #     continue

        test_data = numpy.loadtxt(f_path, delimiter=',')
        label = test_data[:, 0]
        test_data = test_data[:, 1:]
        rows, cols = test_data.shape
        label_count = dict()
        for i in range(rows):
            label_count[label[i]] = label_count.get(label[i], 0) + 1
        test_data = scale(test_data, axis=1)

        # k = config.get(f)
        k = 8
        center, width = get_seg_info(k)
        begin = time.perf_counter()
        t_data = preprocessing(test_data, k)
        predict = minmax_scale(cal_score(test_data, t_data, cols, k))
        end = time.perf_counter()

        # auc = roc_auc_score(label, predict)
        score_ratio, pred = cal_score_ratio(label, predict)
        error = mean_squared_error(label, pred)
        auc = recall_score(label, pred, average='macro')
        # score_ratio = cal_score_ratio(label, predict)
        print("Data=%s, AUC=%f, error=%f, Score_ratio=%f, Time=%f" %
              (f, auc, error, score_ratio, (end - begin)))
Example #40
 def transform(self, topics_and_res):
     from sklearn.preprocessing import minmax_scale
     topics_and_res = topics_and_res.copy()
     topics_and_res["score"] = topics_and_res.groupby(
         'qid')["score"].transform(lambda x: minmax_scale(x))
     return topics_and_res
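The per-query scaling can be reproduced with a plain DataFrame; a minimal sketch assuming a ranking result frame with qid/docno/score columns (illustrative values):

import pandas as pd
from sklearn.preprocessing import minmax_scale

res = pd.DataFrame({'qid': ['q1', 'q1', 'q1', 'q2', 'q2'],
                    'docno': ['d1', 'd2', 'd3', 'd4', 'd5'],
                    'score': [10.0, 5.0, 0.0, 2.0, 1.0]})
res['score'] = res.groupby('qid')['score'].transform(lambda x: minmax_scale(x))
print(res)  # q1 scores become 1.0, 0.5, 0.0; q2 scores become 1.0, 0.0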
Example #41
Data_Set10 = pd.concat([Data_Set8,New_Col], axis = 1)

"""""""""""""""
Dummy Variables

"""""""""""""""
Data_Set10.info()

Data_Set11 = pd.get_dummies(Data_Set10)

Data_Set11.info()

"""""""""""""""
Normalization

"""""""""""""""

from sklearn.preprocessing import minmax_scale, normalize

# First Method: Min Max Scale

Data_Set12 = minmax_scale(Data_Set11, feature_range=(0,1))


Data_Set13 = normalize(Data_Set11, norm = 'l2', axis = 0)

# axis = 0 for normalizing features / axis = 1 is for normalizing each sample

Data_Set13 = pd.DataFrame(Data_Set13,columns = ['Time','E_Plug','E_Heat',
                                                'Price','Temp', 'OffPeak','Peak'])
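A small sketch (hypothetical matrix) contrasting the two calls used above: `minmax_scale` rescales each feature onto a target range, while `normalize` divides by a vector norm along the chosen axis.

import numpy as np
from sklearn.preprocessing import minmax_scale, normalize

M = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
print(minmax_scale(M, feature_range=(0, 1)))  # each column mapped onto [0, 1]
print(normalize(M, norm='l2', axis=0))        # each column divided by its L2 norm
print(normalize(M, norm='l2', axis=1))        # each row divided by its L2 norm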
Example #42
from sklearn.decomposition import PCA
from sklearn import preprocessing
import numpy as np
import pandas as pd

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# load the data
df = pd.read_csv("wine.csv")
x_table = df.drop(columns="Wine")
x = x_table.values
name = x_table.columns

# preprocess the data
x = preprocessing.minmax_scale(x)
# define the model
pca = PCA(n_components=len(x[0]))
# fit the model
pca.fit(x)
# explained variance ratio
con = pca.explained_variance_ratio_
index = []
for i in range(len(con)):
    index.append("第" + str(i + 1) + "主成分")
dfc = pd.DataFrame(con)
dfc.columns = ["寄与率"]
dfc.index = index
print(dfc)

# eigenvectors
Example #43
    plt.show()

all_images_rgb = ld_images("img")
all_images_hog = []
for i in range(len(all_images_rgb)):
    all_images_rgb[i] = cv2.cvtColor(all_images_rgb[i], cv2.COLOR_BGR2RGB)
    fd, hog_image = hog(all_images_rgb[i], orientations = 32, pixels_per_cell = (16, 16), cells_per_block = (1,1), visualize = True, multichannel = True)
    all_images_hog.append(fd)

train_anno = sio.loadmat('train-anno.mat')
face_landmark = train_anno['face_landmark']
trait_annotation = train_anno['trait_annotation']

total_features = np.c_[all_images_hog, face_landmark]

total_features = minmax_scale(total_features, axis = 0)

thresholds = np.mean(trait_annotation, axis = 0)
trait_labels = np.array([[1 if x >= 0 else -1 for x in trait_annotation[:,i]] for i in range(trait_annotation.shape[1])])
trait_labels = trait_labels.T

division = int(0.8 * trait_labels.shape[0])
train_data = total_features[:division,]
train_reg = trait_annotation[:division,]
train_labels = trait_labels[:division,]
test_data = total_features[division:,]
test_reg = trait_annotation[division:,]
test_labels = trait_labels[division:,]

c_range = 2**np.linspace(-5,13,10)
p_range = 2**np.linspace(-9,1,6)
Example #44
def normalizeScale(image, low, high):
    shape = np.shape(image)
    newImage = minmax_scale(image.ravel(),
                            feature_range=(low, high)).reshape(shape)
    return newImage
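A usage sketch with a hypothetical 14-bit image rescaled to the 0-255 display range:

import numpy as np

img = np.random.default_rng(0).integers(0, 2**14, size=(64, 64)).astype(float)
img8 = normalizeScale(img, 0, 255)
print(img8.shape, img8.min(), img8.max())  # (64, 64) 0.0 255.0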
Example #45
def run_palantir(
    ms_data,
    early_cell,
    terminal_states=None,
    knn=30,
    num_waypoints=1200,
    n_jobs=-1,
    scale_components=True,
    use_early_cell_as_start=False,
    max_iterations: int = 25,
):
    """Function for max min sampling of waypoints

    :param ms_data: Multiscale space diffusion components
    :param early_cell: Start cell for pseudotime construction
    :param terminal_states: List/Series of user defined terminal states
    :param knn: Number of nearest neighbors for graph construction
    :param num_waypoints: Number of waypoints to sample
    :param n_jobs: Number of jobs for parallel processing
    :param scale_components:
    :param use_early_cell_as_start:
    :param max_iterations: Maximum number of iterations for pseudotime convergence
    :return: PResults object with pseudotime, entropy, branch probabilities and waypoints
    """

    if scale_components:
        data = pd.DataFrame(
            preprocessing.minmax_scale(ms_data),
            index=ms_data.index,
            columns=ms_data.columns,
        )
    else:
        data = copy.copy(ms_data)

    # ################################################
    # Determine the boundary cell closest to user defined early cell
    dm_boundaries = pd.Index(set(data.idxmax()).union(data.idxmin()))
    dists = pairwise_distances(data.loc[dm_boundaries, :],
                               data.loc[early_cell, :].values.reshape(1, -1))
    start_cell = pd.Series(np.ravel(dists), index=dm_boundaries).idxmin()
    if use_early_cell_as_start:
        start_cell = early_cell

    # Sample waypoints
    print("Sampling and flocking waypoints...")
    start = time.time()

    # Append start cell
    if isinstance(num_waypoints, int):
        waypoints = _max_min_sampling(data, num_waypoints)
    else:
        waypoints = num_waypoints
    waypoints = waypoints.union(dm_boundaries)
    if terminal_states is not None:
        waypoints = waypoints.union(terminal_states)
    waypoints = pd.Index(waypoints.difference([start_cell]).unique())

    # Append start cell
    waypoints = pd.Index([start_cell]).append(waypoints)
    end = time.time()
    print("Time for determining waypoints: {} minutes".format(
        (end - start) / 60))

    # pseudotime and weighting matrix
    print("Determining pseudotime...")
    pseudotime, W = _compute_pseudotime(data, start_cell, knn, waypoints,
                                        n_jobs, max_iterations)

    # Entropy and branch probabilities
    print("Entropy and branch probabilities...")
    ent, branch_probs = _differentiation_entropy(data.loc[waypoints, :],
                                                 terminal_states, knn, n_jobs,
                                                 pseudotime)

    # Project results to all cells
    print("Project results to all cells...")
    branch_probs = pd.DataFrame(
        np.dot(W.T, branch_probs.loc[W.index, :]),
        index=W.columns,
        columns=branch_probs.columns,
    )
    ent = branch_probs.apply(entropy, axis=1)

    # Update results into PResults class object
    res = PResults(pseudotime, ent, branch_probs, waypoints)

    return res
Example #46
#threshold values for 2 classes [-0.0317 -0.0132]
#print (thresholds.values())

for av in all_av:
    un_nm_scores = []
    
    for tc in labels[:len(trained_classes)]:
        clf = isolation_forests[tc]
        un_nm_scores.append(clf.decision_function(av.reshape(1,-1))[0])
	
    all_unnormalized_scores.append(un_nm_scores)

#add thresholds
all_unnormalized_scores.append(list(thresholds.values()))
from sklearn.preprocessing import minmax_scale
f = minmax_scale(np.array(all_unnormalized_scores))
scores = f[:-1]
thres = f[-1]
iso_pred = []

print("scores", scores[:10])
for v in scores:
#     temp = []
    thres_max = {}
    for i, s in enumerate(v):
        if s > thres[i]:
            thres_max[i] = s - thres[i]
    if len(thres_max) == 0:
        iso_pred.append(120)
    else:
        iso_pred.append(max(thres_max, key=thres_max.get))
Example #47
# https://www.epfl.ch/labs/mmspg/research/page-58317-en-html/page-58332-en-html/page-58333-en-html/iqa/

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

df = pd.read_csv('JPEGXR.VQMT.csv')
df['mos'] = minmax_scale(df['mos'], feature_range=(1,5), axis=0)

minMOS = df['mos'].min()
maxMOS = df['mos'].max()
series = np.linspace(maxMOS, minMOS, 20)

results = pd.DataFrame()
for i in series:
	filtered = df[['mos', 'DSSIM', 'SSIMULACRA', 'Butteraugli', 'Butteraugli_XL', 'Butteraugli_XL_3m', 'Butteraugli_XL_2s', 'Butteraugli_XL_3s', 'Butteraugli_XL_6s', 'Butteraugli_XL_12s']][df['mos'] >= i]
	results = results.append(filtered.corr('spearman')[['mos']].T.reset_index(), ignore_index=True, sort=False)

results.index = series
results = results.drop(['mos', 'index'], axis=1)
results = results.dropna(thresh=1)

print(results)

plt.figure(figsize=(1920/96, 1080/96), dpi=96)
#plt.plot(results.index, results['DSSIM'], label='DSSIM')
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale  # normalization
from sklearn.metrics import mean_squared_error
import tensorflow as tf 

# 1. data load
boston = load_boston()
print(boston) # "data", "target"

# 2. variable selection
X = boston.data # normalization
y = boston.target
X.shape # (506, 13)
y_nor = minmax_scale(y)

# train/test split (90 vs 10)
x_train, x_test, y_train, y_test = train_test_split(
        X, y_nor, test_size=0.1, random_state=123)


tf.random.set_seed(123)

# 2. Model class : model = input * w + b
class Model(tf.keras.Model): # inherits from keras Model class
  def __init__(self): # constructor
    super(Model, self).__init__()
    self.w = tf.Variable(tf.random.uniform(shape=[13, 1]))
    #type(self.w) # dtype=float32
    self.b = tf.Variable(tf.zeros(shape=[1]))
                                          'yeast_net_genes.csv'),
                             header=None)

df = pd.read_csv(os.path.join(os.path.expandvars('$AGAPEDATA'),
                              'pombeallpairs..genexp.txt'),
                 sep='\t')
df = df.dropna()

cols = ['Gene1', 'Gene2']

for c in cols:
    df[c] = df[c].apply(lambda x: f'4896.{x}.1')

string_proteins = string_indices[0].values
df = df[(df.Gene1.isin(string_proteins)) & (df.Gene2.isin(string_proteins))]

mapping = dict(zip(string_indices[0], string_indices[1]))

for c in cols:
    df[c] = df[c].apply(lambda x: mapping[x])

df = df.sort_values(by=cols)

df.Expression_correlation = minmax_scale(df.Expression_correlation)

df.to_csv(os.path.join(output_path,
                       f"yeast_z_gene_expression_meta-analysis_adjacency.txt"),
          sep="\t",
          index=False,
          header=False)
    driver = ph.DriverEstim()(eda)
    # driver.plot()
    # plt.show()

    phasic, tonic, _ = ph.PhasicEstim(delta=0.02)(driver)
    # phasic.plot()
    # tonic.plot()
    # plt.show()

    #print(eda.shape)
    M = 20
    splitArr = np.array_split(eda, eda.shape[0] / M)
    res = np.array([item.mean() for item in splitArr])

    res = minmax_scale(res)
    #print(res)
    #print(res.shape[0])

    #aM, l5, numAM, numL5 = getMoments(res)

    segs = np.array_split(res, segements)
    std = np.std(res)
    #print(segs)

    file = open(base + ".txt", "w+")

    plt.plot(res)
    plt.plot(segs[0], 'r')
    plt.show()
Example #51
from pylab import *
import pandas as pd
from sklearn.preprocessing import minmax_scale
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv('train.csv', index_col='Id')
X = minmax_scale(train.iloc[:, :-1])
y = array(train.iloc[:, -1])

test = pd.read_csv('test.csv', index_col='Id')
A = minmax_scale(test)

f = RandomForestClassifier(n_estimators=1000, n_jobs=12)
f.fit(X, y)
p = f.predict(A)

s = pd.read_csv('sampleSubmission.csv', index_col='Id')
s.Cover_Type = p
s.to_csv('sampleSubmission.csv')
# score=0.74480 using RandomForestClassifier;  0.58160 2yrs ago
# scaling to [0,1] made some difference

# NN
from sklearn.neural_network import MLPClassifier

f = MLPClassifier(hidden_layer_sizes=(80, 20), alpha=1e-5, random_state=1)
f.fit(X, y)
p = f.predict(A)

s = pd.read_csv('sampleSubmission.csv', index_col='Id')
s.Cover_Type = p
# In[127]:

N_train = 1331
training_inputs = data[0:N_train, :, :].astype('float32')
val_inputs = data[(N_train + 1):-1, :, :].astype('float32')
training_inputs = training_inputs.reshape(
    (len(training_inputs), np.prod(training_inputs.shape[1:])))
val_inputs = val_inputs.reshape(
    (len(val_inputs), np.prod(val_inputs.shape[1:])))

# In[128]:

from sklearn.preprocessing import minmax_scale

training_inputs = minmax_scale(training_inputs, feature_range=(0, 1), axis=1)
val_inputs = minmax_scale(val_inputs, feature_range=(0, 1), axis=1)
training_targets = targets[0:N_train, :]
val_targets = targets[(N_train + 1):-1, :]
t_targets = target_arr[0:N_train, :]
v_targets = target_arr[(N_train + 1):-1, :]

# In[129]:

percent_noisy = 0.5
indices_tozero = np.random.choice(
    range(training_inputs.shape[0] * training_inputs.shape[1]),
    int(percent_noisy * training_inputs.shape[0] * training_inputs.shape[1]),
    replace=False)
training_inputs_noisy = training_inputs.copy()
np.put(training_inputs_noisy, indices_tozero, 0)
Example #53
def main(time_points):
    t0 = time.time()
    #np.random.seed(seed=2018)
    #random.seed(2018)
    print("Number of time_Points is " + str(time_points))
    ###########################################################################################################
    ## These hyperparameters bound how many slices are extracted from each meter (min and max number) ##########
    #############################################################################################################
    number_of_training_slices = 200
    min_number_of_training_slices = 1
    number_of_test_slices = 200
    min_number_of_test_slices = 200

    zeros_slice_percentage = 0.2  # Percentage of zeros that is allowed for each slice
    ##########################################################################

    if number_of_test_slices == min_number_of_test_slices:
        # True when min == max number of test slices; this balances the meters
        testing_voting = True
    else:
        testing_voting = False

    # Change to output path of "Excelmerge.py"
    path = "~\Concatenated_File_total.csv"
    #Hard Threshholding for removing outliers
    df = pd.read_csv(path, sep=';', header=None)
    data = df.values
    shape = np.shape(data)
    # Shuffling the data
    if (False):
        # Counting NaNs
        print("Counting NaNs")
        nan_percentage = []
        color_array = []
        for i in range(np.shape(data)[1]):
            nan_percentage.append(count_nans(data[:, i]))
            color_array.append(1 / data[0, i])
            print(" Counting:  " + str(i))
        x = np.arange(0, i + 1)
        plt.scatter(x, nan_percentage, c=color_array)
        plt.xlabel(" Meter ")
        plt.ylabel(" Missing Value Quotient ")
        plt.show()
    data = shuffle_data(data)

    print("Number of Time Series is:" + str(np.shape(data)[1]))
    print(np.any(np.isnan(data)))
    max_length = np.shape(data)[0]
    number_of_classes = len(np.unique(data[0, :]))

    if number_of_classes == 5:
        class_names = [" 1 ", " 2 ", "3", "4", " 5 "]
    else:
        class_names = [" 1", " 2 ", "3", " 4 "]
    training_percentage = 0.8
    threshhold = 1e16  # To remove outliers
    indices = data[:, :] < threshhold
    data[indices == False] = -1

    data_test = data[:, int(training_percentage * shape[1]):
                     shape[1]]  # For writing the test-time series to file

    # Training Data
    y = data[0, 0:int(training_percentage * shape[1])]
    X = data[1:shape[0], 0:int(training_percentage * shape[1])]

    # Test Data (may be written directly to file or not)
    X_test = data[1:max_length, int(training_percentage * shape[1]):shape[1]]
    y_test = data[0, int(training_percentage * shape[1]):shape[1]]

    ##########################################################################################################
    ############# Next we create the chunks of size time_points which will be stored as columns in ###########
    ############# X_resampled with corresponding label in y_resampled FOR TRAINING SET             ###########
    ###########################################################################################################
    number_of_chunks_per_customer = []
    X_resampled = []  # Stores the data
    y_resampled = []  # Stores the label
    meter_resampled = []  # Stores the meter value (Ranging from 0-shape[1])
    left_out_slice = []
    booleans = X[:, :] >= 0
    booleans2 = X[:, :] == 0
    boolvec = []
    for j in range(np.shape(X)[1]):  # Looping over the samples
        label = y[j]
        count = 0
        for i in range(
                int(np.shape(X)[0] / time_points)
        ):  # Looping over the number of chunks assuming no overlap



            if  np.all(booleans[i*time_points:i*time_points+time_points,j]) == True and \
            np.sum(booleans2[i*time_points:i*time_points+time_points,j]) <= time_points*zeros_slice_percentage and\
            np.var(X[i*time_points:i*time_points+time_points,j]) > 1e-3:

                var = np.var(
                    X[i * time_points:i * time_points + time_points,
                      j])  # Adding some augmentation to the data random noise

                X_resampled.append(
                    X[i * time_points:i * time_points + time_points, j]
                )  #+ np.random.normal(0,np.sqrt(var),np.shape(X[i*time_points:i*time_points+time_points,j])))
                y_resampled.append(label)

                meter_resampled.append(j)

                count = count + 1

            elif np.sum(booleans[i*time_points:i*time_points+time_points,j]) > 0.8*time_points and \
                np.sum(booleans2[i*time_points:i*time_points+time_points,j]) <= time_points*zeros_slice_percentage and \
                np.var(X[i*time_points:i*time_points+time_points,j]) > 1e-3:
                # Filling NaN's with interpolation if the number of NaN's per slice is below some limit

                out = interp(X[i * time_points:i * time_points + time_points,
                               j])

                X_resampled.append(out)

                y_resampled.append(label)

                meter_resampled.append(j)

                count = count + 1

        number_of_chunks_per_customer.append(count)

    #######################  Creates dataset of slices for test dataset    ###############################
    number_of_chunks_per_customer_test = []
    X_resampled_test = []
    y_resampled_test = []
    meter_resampled_test = []
    booleans_test = X_test[:, :] >= 0
    booleans2_test = X_test[:, :] == 0
    for j in range(np.shape(X_test)[1]):  # Looping over the samples
        label_test = y_test[j]
        count = 0

        for i in range(
                int(np.shape(X_test)[0] / time_points)
        ):  # Looping over the number of chunks assuming no overlap

            if np.all(booleans_test[i * time_points:i * time_points + time_points, j]) and \
               np.sum(booleans2_test[i * time_points:i * time_points + time_points, j]) <= time_points * zeros_slice_percentage:
                # Skip slices that are (nearly) constant; we want some pattern to learn from
                if np.var(X_test[i * time_points:i * time_points + time_points, j]) > 1e-3:

                    X_resampled_test.append(
                        X_test[i * time_points:i * time_points + time_points,
                               j])
                    count = count + 1
                    y_resampled_test.append(label_test)
                    meter_resampled_test.append(j)
        number_of_chunks_per_customer_test.append(count)

    #Next we build the matrix of slices

    X_resampled = np.stack(X_resampled, axis=-1)  # Stacked data
    X_resampled_test = np.stack(X_resampled_test, axis=-1)  # Stacked data

    #Putting together in one big matrix so it is easier to shuffle.
    X_big = np.concatenate([[y_resampled], X_resampled], axis=0)
    X_big = np.concatenate([[meter_resampled], X_big], axis=0)
    X_big_test = np.concatenate([[y_resampled_test], X_resampled_test], axis=0)
    X_big_test = np.concatenate([[meter_resampled_test], X_big_test], axis=0)

    # Shuffle X_big
    X_big = shuffle_data(X_big)

    meter_range = int(np.max(np.unique(X_big[0, :])))
    meter_range_test = int(np.max(np.unique(X_big_test[0, :])))

    # Create Loop for taking max number of samples from each meter

    X_resampled = [[] for _ in range(meter_range + 1)]
    X_resampled_test = [[] for _ in range(meter_range_test + 1)]
    count_meter = np.zeros(meter_range + 1)  # Number of meters in training set
    count_meter_test = np.zeros(meter_range_test + 1)
    count_meter_class = [
        np.zeros(meter_range_test + 1) for _ in range(number_of_classes)
    ]

    for i in range(np.shape(X_big)[1]):  # Go through every sample
        index = int(X_big[0, i])  # Converts Float to integer for indexing
        if count_meter[index] < number_of_training_slices:  # Keep at most number_of_training_slices slices per meter
            X_resampled[index].append(X_big[:, i])
            count_meter[index] = count_meter[index] + 1  # Tracks how many slices were taken from this meter

    print("Number of meters in the training set (before filtering): " + str(len(X_resampled)))
    for i in range(np.shape(X_big_test)[1]):
        index = int(X_big_test[0, i])
        classtmp = int(X_big_test[1, i]) - 1
        if count_meter_test[index] < number_of_test_slices:
            X_resampled_test[index].append(X_big_test[:, i])
            count_meter_test[index] = count_meter_test[index] + 1
            count_meter_class[classtmp][
                index] = count_meter_class[classtmp][index] + 1
    del (X_big)
    del (X_big_test)
    # For each class, make a histogram of the number of slices per meter.
    # This shows how the slices are distributed over the meters of each class
    # and gives a "quality measure" for the meters in that class.
    gurk1 = np.asarray(np.where(count_meter < min_number_of_training_slices))
    gurk2 = np.asarray(np.where(count_meter_test < min_number_of_test_slices))
    for k in range(number_of_classes):
        rem = np.where(count_meter_class[k] >= min_number_of_test_slices)
        indgurk2 = count_meter_test[rem]
        plt.hist(indgurk2)
        plt.title("Class: " + class_names[k])
        plt.xlabel("Number of Slices")
        plt.ylabel("Number of Meters")
        #plt.show()
    #########################################################################
    # Delete the lists i.e meters containing less than min number of slices #
    #########################################################################
    for i in sorted(np.squeeze(gurk1).tolist(), reverse=True):
        del (X_resampled[i])
    for i in sorted(np.squeeze(gurk2).tolist(), reverse=True):
        del (X_resampled_test[i])

    X_big = np.stack(list(itertools.chain.from_iterable(X_resampled)), axis=-1)

    if not testing_voting:
        X_big_test = np.stack(list(
            itertools.chain.from_iterable(X_resampled_test)),
                              axis=-1)

    ##### Need to sort X_big_test by classes in the case where min = max:

    if testing_voting:
        labels = []
        # Assuming now that each meter has the same number of slices, we find the class with the fewest meters
        for k in range(len(X_resampled_test)):
            labels.append(
                X_resampled_test[k][0][1]
            )  #[k]: meter [0] take first slice [1] class of slices, same for all slices in meter k

        # Labels contain the labels for all the meters in the test set.
        a, return_index, return_counts = np.unique(labels,
                                                   return_index=True,
                                                   return_counts=True)
        # return_counts holds the number of meters per class; the smallest one
        # determines how many meters we keep from every class.
        min_number_of_samples_test = np.min(return_counts)
        print(
            "The number of meters in each class in the test set is distributed as follows"
        )
        print(return_counts)
        # Go through X_resampled_test again and remove meters from the larger classes until the dataset is balanced
        count_classes = np.zeros(number_of_classes)
        for k in sorted(np.arange(0, len(X_resampled_test)), reverse=True):
            class_index = int(X_resampled_test[k][0][1]) - 1
            if count_classes[class_index] >= min_number_of_samples_test:
                del (X_resampled_test[k])

            count_classes[class_index] = count_classes[class_index] + 1
            # Count every meter of this class (kept or deleted) so that only the
            # first min_number_of_samples_test meters per class are retained.
        data_shuffeled_test = np.stack(list(
            itertools.chain.from_iterable(X_resampled_test)),
                                       axis=-1)

    # Sorting the datamatrices in order to take same number of samples from each class
    X_big = X_big[:, np.argsort(X_big[1, :])]
    print(X_big)

    a, return_index, return_counts = np.unique(X_big[1, :],
                                               return_index=True,
                                               return_counts=True)

    print("There are a total of " + str(np.shape(X_big)[1]) +
          " samples distributed as follows.")
    print(return_counts)
    print(return_index)

    print()
    print()
    if not testing_voting:
        X_big_test = X_big_test[:, np.argsort(X_big_test[1, :])]
        a_test, return_index_test, return_counts_test = np.unique(
            X_big_test[1, :], return_index=True, return_counts=True)
        print(
            "The number of samples in each class in test set is distributed as follows"
        )
        print(return_counts_test)
        print(return_index_test)
        print()
        min_number_of_samples_test = np.min(return_counts_test)
        data_shuffeled_test = X_big_test
    # Build the final training set with min_number_of_samples per class, randomly sampled and shuffled
    min_number_of_samples = np.min(
        return_counts)  # The number of samples from the smallest class

    indexvector = np.arange(0, np.shape(X_big)[1])  # Indices of all samples

    data_shuffeled = np.ones([
        time_points + 2, number_of_classes * min_number_of_samples
    ])  #Preallocate memory for final data array.

    for i in range(number_of_classes):
        #Picks min_number_of_samples random indices from each class and puts them in the final data array.
        indexchoice = np.random.choice(indexvector[return_index[i]:return_index[i] + return_counts[i]],\
                                       min_number_of_samples,replace=False)
        data_shuffeled[:,i*min_number_of_samples:(i+1)*min_number_of_samples] = \
        X_big[:,indexchoice]

    print("Check for NaN's before feature scaling")
    print(np.any(np.isnan(data_shuffeled)))
    print(np.any(np.isnan(data_shuffeled_test)))
    # For checking that this works
    a2, return_index2, return_counts2 = np.unique(data_shuffeled[1, :],
                                                  return_index=True,
                                                  return_counts=True)

    a2_test, return_index2_test, return_counts2_test = np.unique(
        data_shuffeled_test[1, :], return_index=True, return_counts=True)

    print("The number of samples in each class is " +
          str(min_number_of_samples) + ".")
    print(return_counts2)
    print(return_index2)
    print()
    print()

    print("The number of samples in each class for Test set is " + str() + ".")
    print(return_counts2_test)
    print(return_index2_test)
    print()
    print()

    # Normalization

    data_shuffeled[2:np.shape(data_shuffeled)[0],:] = \
        minmax_scale(data_shuffeled[2:np.shape(data_shuffeled)[0],:],feature_range = (0,1),axis = 0,copy=False)

    data_shuffeled_test[2:np.shape(data_shuffeled_test)[0],:] = \
        minmax_scale(data_shuffeled_test[2:np.shape(data_shuffeled_test)[0],:],feature_range=(0,1),axis=0,copy=False)

    print("Check for NaN's after feature scaling")
    print(np.any(np.isnan(data_shuffeled)))
    print(np.any(np.isnan(data_shuffeled_test)))
    # Final shuffle of the training data
    data_shuffeled = shuffle_data(data_shuffeled)
    data_shuffeled_test = shuffle_data(data_shuffeled_test)
    # Extract final data, write to csv

    y_one_hot = np.zeros([number_of_classes, np.shape(data_shuffeled)[1]])
    for k in range(np.shape(data_shuffeled)[1]):
        for p in range(number_of_classes):
            if p + 1 == data_shuffeled[1, k]:
                y_one_hot[p, k] = 1

    y_one_hot_test = np.zeros(
        [number_of_classes,
         np.shape(data_shuffeled_test)[1]])
    for k in range(np.shape(data_shuffeled_test)[1]):
        for p in range(number_of_classes):
            if p + 1 == data_shuffeled_test[1, k]:
                y_one_hot_test[p, k] = 1

    # Write processed data into a file.
    filename = "~\Final_Data" + str(time_points)
    if os.path.exists(
            filename + ".csv"
    ):  #Checks if file exists and if true removes this makes sure it overwrites it.
        print("Removing old " + filename + ".csv" + " before writing new.")
        os.remove(filename + ".csv")
    np.savetxt(filename + ".csv", data_shuffeled,
               delimiter=';')  # Prints data to file
    print("Final_Data Length is " + str(np.shape(data_shuffeled)[1]))

    filename_y_one_hot = "~\Y_One_Hot" + str(time_points)
    if os.path.exists(
            filename_y_one_hot + ".csv"
    ):  #Checks if file exists and if true removes this makes sure it overwrites it.
        print("Removing old " + filename_y_one_hot + ".csv" +
              " before writing new.")
        os.remove(filename_y_one_hot + ".csv")
    np.savetxt(filename_y_one_hot + ".csv", y_one_hot,
               delimiter=';')  # Prints data to file

    print("Final Test Data Length is " + str(np.shape(data_shuffeled_test)[1]))
    filename_test = "~\Test_Data" + str(time_points)
    if os.path.exists(filename_test + ".csv"):
        print("Removing old " + filename_test + ".csv" +
              " before writing new.")
        os.remove(filename_test + ".csv")
    np.savetxt(filename_test + ".csv", data_shuffeled_test, delimiter=';')

    filename_y_one_hot_test = "~\Y_One_Hot_Test" + str(time_points)
    if os.path.exists(
            filename_y_one_hot_test + ".csv"
    ):  #Checks if file exists and if true removes this makes sure it overwrites it.
        print("Removing old " + filename_y_one_hot_test + ".csv" +
              " before writing new.")
        os.remove(filename_y_one_hot_test + ".csv")
    np.savetxt(filename_y_one_hot_test + ".csv", y_one_hot_test,
               delimiter=';')  # Prints data to file

    t1 = time.time()
    print("Code ran in:" + str(np.round((t1 - t0) / 60, decimals=3)) +
          " minutes.")
    return 0
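
A minimal sketch of how the files written above could be read back for training, assuming the home-directory paths used above and a matching time_points value (96 here is an arbitrary placeholder). In Final_Data, row 0 is the meter index, row 1 the class label, and the remaining rows hold the scaled slice values, one slice per column.

import os
import numpy as np

time_points = 96  # assumption: must match the value used when the files were written

base = os.path.expanduser("~")
final_data = np.loadtxt(os.path.join(base, "Final_Data" + str(time_points) + ".csv"), delimiter=';')
y_one_hot = np.loadtxt(os.path.join(base, "Y_One_Hot" + str(time_points) + ".csv"), delimiter=';')

meters = final_data[0, :]      # meter index of each slice
labels = final_data[1, :]      # class label (1-based) of each slice
X_train = final_data[2:, :].T  # one row per slice, time_points values each
Y_train = y_one_hot.T          # one-hot labels, aligned with the rows of X_train
print(X_train.shape, Y_train.shape)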
示例#54
0
# NOTE: the imports and the class-0 loading loop below are assumptions added to make the
# snippet runnable; they mirror the class-1 loop that follows, and the paths are placeholders.
import librosa
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score

data = []
label = []
for i in range(10):
    path = "class0_folder_name" + str(i + 1) + "extension"  # placeholder path for class-0 clips
    a, sr = librosa.load(path)
    a = a[:100000]  # trim all clips to the same length
    y = librosa.feature.mfcc(y=a, sr=sr)
    data.append(y)
    label.append(0)

for i in range(10):
    path = "folder_name" + str(i + 1) + "extension"  # placeholder path for class-1 clips
    a, sr = librosa.load(path)
    a = a[:100000]  # trim all clips to the same length
    y = librosa.feature.mfcc(y=a, sr=sr)
    data.append(y)
    label.append(1)

# Average each clip's MFCC matrix over the coefficients to get one feature vector per clip
for i in range(len(data)):
    data[i] = sum(data[i]) / len(data[i])
data = preprocessing.minmax_scale(data)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3)

# Training
clf = MLPClassifier(hidden_layer_sizes=(500, 500, 250))
clf.fit(x_train, y_train)

# Prediction
y_pred = clf.predict(x_test)

# Evaluate the model
print("Accuracy")
print(accuracy_score(y_test,y_pred))
print("Precision")
示例#55
0
def normalize(dataset):
    return preprocessing.minmax_scale(dataset, feature_range=(0, 1))

# Encode the result
encoder = LabelEncoder()
source['Result'] = encoder.fit_transform(source['Result'])
print(source['Result'].unique())  # Shows that all the values are encoded

# Convert training data to numeric
source['Percentage'] = source['Percentage'].apply(
    pd.to_numeric,
    errors='coerce')  # errors='coerce' turns non-numeric values into NaN
source['Percentage'].fillna(round(source['Percentage'].mean(), 2),
                            inplace=True)  # Change NaN to mean value

# Feature Scaling
#max = source['Percentage'].max()
source['GRE'] = preprocessing.minmax_scale(source['GRE'], feature_range=(0, 1))
source['GRE (Quants)'] = preprocessing.minmax_scale(source['GRE (Quants)'],
                                                    feature_range=(0, 1))
source['AWA'] = preprocessing.minmax_scale(source['AWA'], feature_range=(0, 1))
source['TOEFL'] = preprocessing.minmax_scale(source['TOEFL'],
                                             feature_range=(0, 1))
source['Work-Ex'] = preprocessing.minmax_scale(source['Work-Ex'],
                                               feature_range=(0, 1))
source['International Papers'] = preprocessing.minmax_scale(
    source['International Papers'], feature_range=(0, 1))
source['Percentage'] = preprocessing.minmax_scale(source['Percentage'],
                                                  feature_range=(0, 1))

# Test Train Split
features = [
    'GRE', 'GRE (Quants)', 'AWA', 'TOEFL', 'Work-Ex', 'International Papers',
示例#57
0
File: predict.py Project: averak/CADNN
    key = input()

    if mode == 0:
        # Start recording
        print('===== {0} START ==============='.format(cnt))
        record.record_start.set()
        record.record_end.clear()
        mode = 1

    else:
        # Stop recording
        print('===== END ===============')
        record.record_start.clear()
        while not record.record_end.is_set():
            pass
        mode = 0
        cnt += 1

        x1 = []
        x2 = []
        wav, fs = librosa.load('tmp/voice.wav', sr=8000)
        context_feature = librosa.feature.mfcc(y=wav, sr=fs, hop_length=10**6, htk=True).T[0]
        mfcc = librosa.feature.mfcc(y=wav, sr=fs, n_mfcc=32).T
        for frame in mfcc:
            x1.append(preprocessing.minmax_scale(frame))
            x2.append(context_feature)

        pred = [classes[np.argmax(p)] for p in model.predict([x1, x2])]
        print(pred)
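
The record module used above comes from the surrounding project and is not shown here. A hypothetical stand-in, matching only the attributes the loop touches (record_start and record_end as threading.Event flags), might look like the sketch below; the real module may be organised quite differently.

import threading
import types

# Hypothetical stand-in: two events acting as a start/stop handshake between
# the prediction loop and a recorder thread.
record = types.SimpleNamespace(record_start=threading.Event(),
                               record_end=threading.Event())

def recorder_worker():
    while True:
        record.record_start.wait()        # wait until a recording is requested
        # ... capture audio to tmp/voice.wav while record_start stays set ...
        while record.record_start.is_set():
            pass                          # placeholder for the actual capture loop
        record.record_end.set()           # tell the prediction loop the file is ready

threading.Thread(target=recorder_worker, daemon=True).start()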

示例#58
0
def scale_min_max(a: np.ndarray):
    return minmax_scale(a)
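
As a quick illustration of what this wrapper does: minmax_scale maps each column (feature) of a 2-D array to the target range independently, while axis=1 would rescale each row instead.

import numpy as np
from sklearn.preprocessing import minmax_scale

a = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [4.0, 40.0]])
print(minmax_scale(a))          # each column mapped to [0, 1] independently
print(minmax_scale(a, axis=1))  # each row mapped to [0, 1] instead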
示例#59
0
##############################
kidtx = pd.read_csv(
    'GBMLGG_new_time_methylation_mRNA10_all_lmqcm_gamma=none_minClusterSize=10_202012024_2.csv'
)
dataX1 = kidtx.drop(["Unnamed: 0", "ID", "vital_status", "days"],
                    axis=1).values
#y = np.transpose(np.round(np.array(kidtx["days"]/30),2), kidtx["vital_status"]) # V1=time; erged_data33=status

[m0, n0] = dataX1.shape
dataX = np.asarray(dataX1)
#dataX =minmax_scale(dataX )
data_mRNA = dataX1[:, 0:36]
data_methylation = dataX1[:, 36:n0]
#dataX=data_mRNA
#dataX=data_methylation
dataX = minmax_scale(dataX)
[m, n] = dataX.shape
[m1, n1] = data_methylation.shape
[m2, n2] = data_mRNA.shape

dataX = dataX.reshape(m, 1, n)
x = dataX
data_methylation = data_methylation.reshape(m1, 1, n1)
data_mRNA = data_mRNA.reshape(m2, 1, n2)

ytime = np.round(np.array(kidtx["days"] / 30),
                 1)  # survival time in months (days / 30)
ystatus = np.transpose(np.array(
    kidtx["vital_status"]))  # vital status (event indicator)
y = np.transpose([ytime, ystatus])
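
A possible next step, sketched here rather than taken from the snippet: split the reshaped features together with the paired (time, status) targets so each sample keeps its own survival time and status; test_size and random_state below are arbitrary choices.

from sklearn.model_selection import train_test_split

# x has shape (m, 1, n); y has shape (m, 2) with columns (time in months, vital status)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape, y_train.shape)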
示例#60
0
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)


def create_axes(title, figsize=(16, 6)):
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title)

    # define the axis for the first plot
    left, width = 0.1, 0.22
    bottom, height = 0.1, 0.7
    bottom_h = height + 0.15
    left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.1]
    rect_histy = [left_h, bottom, 0.05, height]