Example #1
def test_compare_with_sklearn(svd_solver, batch_number):
    X = iris.data
    X_da = da.from_array(X, chunks=(3, -1))
    batch_size = X.shape[0] // batch_number
    ipca = sd.IncrementalPCA(n_components=2, batch_size=batch_size)
    ipca.fit(X)
    ipca_da = IncrementalPCA(
        n_components=2, batch_size=batch_size, svd_solver=svd_solver
    )
    ipca_da.fit(X_da)
    np.testing.assert_allclose(ipca.components_, ipca_da.components_, atol=1e-13)
    np.testing.assert_allclose(
        ipca.explained_variance_, ipca_da.explained_variance_, atol=1e-13
    )
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_, ipca_da.explained_variance_ratio_, atol=1e-13
    )
    if svd_solver == "randomized":
        # noise variance in randomized solver is probabilistic.
        assert_almost_equal(ipca.noise_variance_, ipca_da.noise_variance_, decimal=1)
    else:
        np.testing.assert_allclose(
            ipca.noise_variance_, ipca_da.noise_variance_, atol=1e-13
        )
Example #2
def main():

    df = pd.read_csv("data.csv").dropna()

    inputs = list(zip(df['refcode'], df['smiles']))
    print(len(inputs))
    with futures.ProcessPoolExecutor(max_workers=7) as executor:
        a = [i for i in executor.map(get_fps, inputs) if i[0] is not None]
        print(len(a))

    fps, refcodes = zip(*a)
    X = np.array(fps)
    cutoff = 0.999
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X2 = sel.fit_transform(X)
    pca = decomposition.IncrementalPCA(n_components=3, batch_size=50000)
    X_pca = pca.fit_transform(X2)  # Dense data required

    print(pca.explained_variance_)  # explained_variance_ lives on the fitted estimator, not on the transformed array

    x = X_pca[:, 0]
    y = X_pca[:, 1]

    out_df = pd.DataFrame({"x": x, "y": y, "refcode": refcodes})

    out_df.to_csv("pca_data_3d.csv")
Example #3
def main_plot_pca(data, n_comps):
    print "training the PCA model"
    pca = decomposition.IncrementalPCA(n_components=n_comps, batch_size=400)
    data_trans = pca.fit_transform(data)
    print "pca model trained, variance:"
    print(pca.explained_variance_ratio_)
    return pca, data_trans
Example #4
def pca_incremental(X, PC=2):
    print "PCA....."
    print("Incremental PCA, using %3d principal components" % PC)
    scaler = preprocessing.StandardScaler().fit(X)
    X_centered = scaler.transform(X)
    X_pca = decomposition.IncrementalPCA(
        n_components=PC).fit_transform(X_centered)
    return X_pca
Example #5
def my_pca(features, labels, num_pca):
    pca = decomposition.IncrementalPCA(batch_size=50)
    pca.n_components = num_pca
    pca.fit(features)
    X_reduced = pca.transform(features)
    print(len(np.unique(labels)))
    print('PCA is fitted')
    return pca, X_reduced
Example #6
    def __init__(self, type='auto', *args, **kwargs):

        if type == 'auto':
            self.model = decomposition.PCA(*args, **kwargs)
        elif type == 'incremental':
            self.model = decomposition.IncrementalPCA(*args, **kwargs)
        elif type == 'kernel':
            self.model = decomposition.KernelPCA(*args, **kwargs)
        else:
            raise ValueError("The type '%s' does not exist." % type)
Example #7
 def IncrementalPCA(self, source):
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     pca = decomposition.IncrementalPCA(n_components=2)
     result = {}
     result['data'] = pca.fit_transform(data_source)
     params = 0.0
     for j in pca.explained_variance_ratio_:
         params = params + j
     result['params'] = params
     return result
Example #8
def incrementalPCA(*data):
    X, y = data
    fig = plt.figure()
    # IncrementalPCA is intended for very large datasets: the data is loaded into memory in batches
    ax1 = fig.add_subplot(1, 1, 1)
    incPCA = decomposition.IncrementalPCA(n_components=2, batch_size=10)
    incPCA.fit(X)
    newData1 = incPCA.transform(X)
    color = ['blue', 'black', 'red']
    for i in range(newData1.shape[0]):
        ax1.scatter(newData1[i, 0], newData1[i, 1], color=color[y[i]])
    ax1.set_title('3 class after dimensionality reduction using IncrementalPCA')
    ax1.legend(loc='best')
    plt.show()
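The comment above notes that IncrementalPCA is meant for datasets too large to hold in memory at once. Below is a minimal sketch of that batch-wise workflow via partial_fit; it is not part of the original example, and the chunk size and in-memory array stand in for a memory-mapped or streamed source.

import numpy as np
from sklearn import decomposition

def fit_ipca_in_chunks(X, n_components=2, chunk_size=1000):
    # X only needs to support row slicing, e.g. a np.memmap backed by a file
    # much larger than RAM (hypothetical setup for illustration)
    ipca = decomposition.IncrementalPCA(n_components=n_components)
    for start in range(0, X.shape[0], chunk_size):
        chunk = X[start:start + chunk_size]
        if chunk.shape[0] >= n_components:  # partial_fit needs >= n_components rows
            ipca.partial_fit(chunk)
    return ipca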
Example #9
def PPA_batches(matrix, batch_size):
    # PCA to get Top Components
    n = matrix.shape[1]
    pca = decomposition.IncrementalPCA(n_components=n, batch_size=batch_size)
    X_train = matrix - np.mean(matrix)
    X_fit = pca.fit_transform(X_train)
    U1 = pca.components_

    z = []

    # Removing Projections on Top Components
    for i, x in enumerate(X_train):
        for u in U1[0:7]:
            x = x - np.dot(u.transpose(), x) * u
        z.append(x)

    return np.asarray(z)
Example #10
 def test_train_incremental_pca(self):
     iris = datasets.load_iris()
     iris = iris.data[: BATCH_SIZE * BATCHES]
     pca = decomposition.IncrementalPCA(
         n_components=COMPONENTS, batch_size=BATCH_SIZE
     )
     pca.fit(iris)
     proj_ref = pca.transform(iris)
     batch = tf.placeholder(dtype=tf.float32, shape=[BATCH_SIZE, FEATURES])
     step = tf.train.get_or_create_global_step()
     train_op, _ = train_incremental_pca(step, batch, COMPONENTS)
     with self.test_session() as sess:
         sess.run(tf.initialize_all_variables())
         for iris_batch in np.split(iris, BATCHES):
             sess.run(train_op, feed_dict={batch: iris_batch})
         proj = sess.run(incremental_pca(iris, COMPONENTS))
     self.assertAllClose(proj, proj_ref)
Example #11
def my_pca(features, labels, num_pca):
    #mean_feature = np.mean(features, axis=0)
    #normalized_features = features - mean_feature
    #pca = decomposition.PCA(svd_solver='randomized')
    pca = decomposition.IncrementalPCA(batch_size=50)
    pca.n_components = num_pca
    pca.fit(features)
    print('Remained variance is:') 
    print(pca.explained_variance_ratio_) 
    X_reduced = pca.transform(features)
    print('PCA is fitted')
    if num_pca==2:
        plt.figure()
        for c, i in zip("rgbcmyk", [0, 1, 2, 3, 4, 5, 6]):
            plt.scatter(X_reduced[labels == i, 0], X_reduced[labels == i, 1], c=c)
        plt.title('PCA of features')
    return pca, X_reduced
Example #12
def fit_pca_to_macs(loader, device, log_interval, whiten):
    """Extract MACs and use them to fit a sklearn PCA"""
    print('Fitting PCA to whiten MACs...')
    feature_transform = feature_transforms.MACStackTransform()
    pca = None
    for idx, features in enumerate(
            transform_features(feature_transform, loader, device)):
        features = features.cpu().numpy()
        if pca is None:
            # Lazy init
            num_features = features.shape[1]
            pca = decomposition.IncrementalPCA(n_components=num_features,
                                               whiten=whiten)
        pca.partial_fit(features)
        if idx % log_interval == 0:
            print('Fitted batch {}/{}'.format(idx, len(loader)))

    return pca
Example #13
def test_ipca():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    n_components = 2
    ipca = decomposition.IncrementalPCA(n_components=n_components,
                                        batch_size=10)
    X_ipca_org = ipca.fit_transform(X)

    mean = ipca.mean_
    components = ipca.components_
    X_ipca = X - mean
    X_ipca = np.dot(X_ipca, components.T)

    pca = decomposition.PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    colors = ['navy', 'turquoise', 'darkorange']

    for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
        plt.figure(figsize=(8, 8))
        for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
            plt.scatter(X_transformed[y == i, 0],
                        X_transformed[y == i, 1],
                        color=color,
                        lw=2,
                        label=target_name)

        if "Incremental" in title:
            err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
            plt.title(title + " of iris dataset\nMean absolute unsigned error "
                      "%.6f" % err)
        else:
            plt.title(title + " of iris dataset")
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.axis([-4, 4, -1.5, 1.5])

    plt.show()
Example #14
def decompose(embeddings, n_components, batch_size=None):
    """Apply CPA over input dataset. If batch size is not None, use incremental PCA."""

    embeddings = pd.DataFrame(embeddings).T.reset_index()

    embeddings = embeddings.set_index('index')

    if batch_size is None:
        PCA = decomposition.PCA(n_components=n_components)

    else:
        PCA = decomposition.IncrementalPCA(n_components=n_components,
                                           batch_size=batch_size)

    X = PCA.fit_transform(embeddings)

    output = {}

    for i, label in enumerate(embeddings.index):

        output[label] = X[i]

    return output
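A hypothetical usage sketch for decompose (the toy word vectors below are invented): it expects a mapping from label to vector and returns a mapping from label to its reduced coordinates.

import numpy as np

rng = np.random.RandomState(0)
embeddings = {word: rng.normal(size=16) for word in ['cat', 'dog', 'car', 'bus', 'tree']}

reduced = decompose(embeddings, n_components=2)                    # plain PCA
# reduced = decompose(embeddings, n_components=2, batch_size=3)    # incremental variant
print(reduced['cat'].shape)  # (2,)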
Example #15
def row_embeddings(df, embeddings, prefix, n_components=2, batch_size=None):
    """Map embeddings on input dataframe and apply PCA on the input dataframe. Apply PCA after mapping
    embeddings.
    """
    e = {}
    for key, value in embeddings.items():
        e[key] = value.tolist()

    X = []

    for column in df.columns:

        if column in prefix:
            embedding = (prefix[column] + df[column].astype(str)).map(e)
        else:
            embedding = df[column].astype(str).map(e)

        X.append(pd.DataFrame(dict(zip(df.index, embedding))).T)

    X = pd.concat(X, axis='columns')

    if batch_size is None:
        PCA = decomposition.PCA(n_components=n_components)
    else:
        PCA = decomposition.IncrementalPCA(n_components=n_components,
                                           batch_size=batch_size)

    X = PCA.fit_transform(X)

    X = pd.DataFrame(X)

    X.columns = [f'dim_{i}' for i in range(len(X.columns))]

    X.index = df.index

    return X
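A hypothetical usage sketch for row_embeddings showing the expected shapes of df, embeddings and prefix (all toy values, not from the original project):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'city': ['paris', 'rome', 'berlin', 'rome'],
                   'country': ['fr', 'it', 'de', 'it']})
keys = ['city_paris', 'city_rome', 'city_berlin', 'country_fr', 'country_it', 'country_de']
embeddings = {k: rng.normal(size=4) for k in keys}   # one 4-d vector per prefixed category
prefix = {'city': 'city_', 'country': 'country_'}

X = row_embeddings(df, embeddings, prefix, n_components=2)
print(X)  # one dim_0/dim_1 pair per input row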
Example #16
def validate(val_loader, net, criterion, epoch, num_known_classes, num_unknown_classes, hidden, train_args):
    
    model_list = []
    
    # Setting network for evaluation mode.
    net.eval()
    
    count = 0
    
    n_patches = 0
    if dataset_name == 'Vaihingen':
        n_patches = 907 # Vaihingen.
    elif dataset_name == 'Potsdam':
        n_patches = 12393 # 8993 # Potsdam.
        
    np.random.seed(12345)
    
    with torch.no_grad():
        
        ipca_training_time = [0.0 for c in range(num_known_classes)]
        
        # Creating output directory.
        check_mkdir(os.path.join(outp_path, exp_name, 'epoch_' + str(epoch)))
        
        for c in range(num_known_classes):
            
            # Computing PCA models from features.
            model = decomposition.IncrementalPCA(n_components=args['n_components'])
            
            model_list.append(model)
        
        for i, data in enumerate(val_loader):
            
            print('Validation Batch %d/%d' % (i + 1, len(val_loader)))
            sys.stdout.flush()
            
            # Obtaining images, labels and paths for batch.
            inps_batch, labs_batch, true_batch, img_name = data
            
            inps_batch = inps_batch.squeeze()
            labs_batch = labs_batch.squeeze()
            true_batch = true_batch.squeeze()
            
            # Iterating over patches inside batch.
            for j in range(inps_batch.size(0)):
                
                print('    Validation MiniBatch %d/%d' % (j + 1, inps_batch.size(0)))
                sys.stdout.flush()
                
                for k in range(inps_batch.size(1)):
                    
                    inps = inps_batch[j, k].unsqueeze(0)
                    labs = labs_batch[j, k].unsqueeze(0)
                    true = true_batch[j, k].unsqueeze(0)

                    # Casting tensors to cuda.
                    inps, labs, true = inps.cuda(args['device']), labs.cuda(args['device']), true.cuda(args['device'])

                    # Casting to cuda variables.
                    inps = Variable(inps).cuda(args['device'])
                    labs = Variable(labs).cuda(args['device'])
                    true = Variable(true).cuda(args['device'])

                    # Forwarding.
                    if conv_name == 'fcnwideresnet50':
                        outs, classif1, fv2 = net(inps, feat=True)
                    elif conv_name == 'fcndensenet121':
                        outs, classif1, fv2 = net(inps, feat=True)
                    
                    # Computing loss.
                    soft_outs = F.softmax(outs, dim=1)

                    # Obtaining predictions.
                    prds = soft_outs.data.max(1)[1]
                    
                    if conv_name == 'fcnwideresnet50':
                        feat_flat = torch.cat([outs.squeeze(), classif1.squeeze(), fv2.squeeze()], 0)
                    elif conv_name == 'fcndensenet121':
                        feat_flat = torch.cat([outs.squeeze(), classif1.squeeze(), fv2.squeeze()], 0)

                    feat_flat = feat_flat.permute(1, 2, 0).contiguous().view(feat_flat.size(1) * feat_flat.size(2), feat_flat.size(0)).cpu().numpy()
                    prds_flat = prds.cpu().numpy().ravel()
                    true_flat = true.cpu().numpy().ravel()
                    
                    for c in range(num_known_classes):
                        
                        tic = time.time()
                        
                        model_list[c] = partial_fit_ipca_model(model_list[c], feat_flat, true_flat, prds_flat, c)
            
                        toc = time.time()
                        ipca_training_time[c] += (toc - tic)
            
    for c in range(num_known_classes):
        
        print('Time spent fitting model %d: %.2f' % (c, ipca_training_time[c]))
    
    model_full = {'generative': model_list}
    
    # Saving model on disk.
    model_path = os.path.join(outp_path, exp_name, 'model_pca.pkl')
    print('Saving model at "%s"...' % (model_path))
    sys.stdout.flush()
    joblib.dump(model_full, model_path)
    
    return model_full
Example #17
        f_pca = f_pca.get_data()
        f_pca = f_all[f_pca==1]
        f_pca = f_pca.reshape(-1)
        f_part = f_all[label_img==1]
        #f = np.array(f)
        #d_std = StandardScaler().fit_transform(d)
        print(f_part.shape)
        X[c,idx,:f_part.shape[0]] = (f_part-f_part.min())/(f_part.max()-f_part.min())
        #f_pca_std = StandardScaler().fit_transform(f_pca)
        X_pca[c,idx,:f_pca.shape[0]] = (f_pca-f_pca.min())/(f_pca.max()-f_pca.min()) 

### fit patient data ####
# PCA on the indicator features
print('pca starting')
# save the PCA model
zhibiao_pca = decomposition.IncrementalPCA(n_components=3)
zhibiao_pca.fit(X_pca.reshape(classnum,-1).T)
#joblib.dump(zhibiao, os.getcwd()+'/modelsave/zhibiao_pca_5000.pkl')
#zhibiao_pca = joblib.load(os.getcwd()+'/modelsave/zhibiao_pca_5000.pkl')
X_outpca = zhibiao_pca.transform(X.reshape(classnum,-1).T)  # midbrain or whole brain

###############################################################################
# derive the X and Y data

X_learning = np.zeros([all_num,data_shape*2])           #(n_samples, n_features) 
for i in range(all_num):
    X_learning[i,:data_shape]=X_outpca[i*data_shape:(i+1)*data_shape,0]
    X_learning[i,data_shape:]=X_outpca[i*data_shape:(i+1)*data_shape,1]

Y_learning = np.zeros(all_num,int)
Y_learning[data_num:] =  np.ones(pat_num,int)
Example #18
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
Example #19
    # print(df_na_analyse)

    fill_na(df)
    numerize(df)

    # print(len(df))
    df = df.dropna()
    df = shuffle(df).reset_index()
    # print(len(df))

    # get the inputs and outputs
    X, y = get_x_y(df, u'VARIABLE_CIBLE')

    # Principal component analysis
    # X -= X.mean()
    pca = decomposition.IncrementalPCA(n_components=25)
    pca.fit(X)
    X = pca.transform(X)
    X = preprocessing.scale(X)
    # print(X.shape)

    # Create the train, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=.4)
    X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=.5)

    # print(X_train.shape)
    # print(X_valid.shape)
    # print(X_test.shape)

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
Example #20
def streaming_PCA(samples, n_components=2, batch_size=100):
    ipca = decomposition.IncrementalPCA(n_components=n_components,
                                        batch_size=batch_size)
    tz.pipe(samples, cur.partition(batch_size), cur.map(np.array),
            cur.map(ipca.partial_fit), tz.last)
    return ipca
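streaming_PCA never materialises the whole dataset: toolz partitions the incoming sample stream into batches of batch_size and pipes each one through partial_fit. A hypothetical usage sketch, with a random in-memory array standing in for a lazy stream of samples:

import numpy as np

X = np.random.RandomState(0).normal(size=(1000, 20))
# any iterable of feature vectors works; cur.partition drops a final batch
# shorter than batch_size, so leftover samples are ignored during fitting
ipca = streaming_PCA(iter(X), n_components=2, batch_size=100)
X_2d = ipca.transform(X)
print(X_2d.shape)  # (1000, 2)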
Example #21
iris = myML.DataPre.load_datasets("iris")  # use the iris dataset bundled with scikit-learn
X, y = iris.data, iris.target

# Test the usage of PCA (note: this PCA computes the SVD via scipy.linalg, so it cannot be
# applied to sparse matrices and does not scale to very large datasets)
pca = decomposition.PCA(n_components=None)  # use the default n_components
pca.fit(X)
print('explained variance ratio : %s' % str(pca.explained_variance_ratio_))

# Plot the sample points after reducing the data to two dimensions with PCA
myML.DimReduce.plotparam_decomposition(X,
                                       y,
                                       "decomposition.PCA()",
                                       n_components=[2])

# IncrementalPCA: dimensionality reduction for very large datasets
pca = decomposition.IncrementalPCA(n_components=None)  # use the default n_components
pca.fit(X)
print('explained variance ratio : %s' % str(pca.explained_variance_ratio_))

# --- Kernelized linear dimensionality reduction: KernelPCA
from sklearn import decomposition
iris = myML.DataPre.load_datasets("iris")  # use the iris dataset bundled with scikit-learn
X, y = iris.data, iris.target

# Test the usage of KernelPCA
kernels = ['linear', 'poly', 'rbf']
for kernel in kernels:
    kpca = decomposition.KernelPCA(n_components=None,
                                   kernel=kernel).fit(X)  # test each kernel function in turn
    print('kernel=%s --> lambdas: %s' % (kernel, kpca.lambdas_))
Example #22
def Bassan(wavenumbers, App, m0, n_components=8, iterations=1, w_regions=None):
    """
    Correct scattered spectra using Bassan's algorithm.
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of principal components to be calculated for the extinction matrix
    :param iterations: number of iterations of the algorithm
    :param w_regions: the regions to be taken into account for the fitting
    :return: corrected apparent spectrum
    """
    # Copy the input data
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)
    ii = np.argsort(wn)  # Sort the wavenumbers
    # Apply the sorting to the input variables
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    n_loadings = 10  # Number of values to be computed for each parameter (a, b, d)
    a = np.linspace(1.1, 1.5, n_loadings)  # Average refractive index
    d = np.linspace(2.0, 8.0, n_loadings) * 1.0e-4  # Cell diameter

    # Define the weighted regions:
    if w_regions is not None:
        m_0 = correct_reference(
            np.copy(m_0), wn, a, d,
            w_regions)  # Correct the reference spectrum as in Kohler method
        w_indexes = []
        # Get the indexes of the regions to be taken into account
        for pair in w_regions:
            min_pair = min(pair)
            max_pair = max(pair)
            ii1 = find_nearest_number_index(wn, min_pair)
            ii2 = find_nearest_number_index(wn, max_pair)
            w_indexes.extend(np.arange(ii1, ii2))
        # Take the weighted regions of wavenumbers, apparent and reference spectrum
        wn_w = np.copy(wn[w_indexes])
        A_app_w = np.copy(A_app[w_indexes])
        m_0_w = np.copy(m_0[w_indexes])

    Q = np.zeros((n_loadings**3, len(wn)))  # Initialize the extinction matrix
    m_n = np.copy(
        m_0
    )  # Initialize the reference spectrum, that will be updated after each iteration
    for iteration in range(iterations):
        # Compute the scaled real part of the refractive index by Kramers-Kronig transform:
        nkk = -1.0 * np.imag(hilbert(m_n))
        # Build the extinction matrix
        n_row = 0
        for i in range(n_loadings):
            b = np.linspace(0.0, a[i] - 1.0,
                            10)  # Range of amplification factors of nkk
            for j in range(n_loadings):
                for k in range(n_loadings):
                    n = a[i] + b[j] * nkk  # Compute the real refractive index
                    alpha = 2.0 * np.pi * d[k] * (n - 1.0)
                    rho = alpha * wn
                    #  Compute the extinction coefficients for each combination of a, b and d:
                    Q[n_row] = 2.0 - np.divide(4.0, rho) * np.sin(rho) + \
                               np.divide(4.0, rho ** 2.0) * (1.0 - np.cos(rho))
                    n_row += 1

        # Orthogonalization of the extinction matrix with respect to the reference spectrum:
        for i in range(n_loadings**3):
            Q[i] -= np.dot(Q[i], m_0) / np.linalg.norm(m_0)**2.0 * m_0

        # Perform PCA of the extinction matrix
        pca = skl_decomposition.IncrementalPCA(n_components=n_components)
        pca.fit(Q)
        p_i = pca.components_  # Get the principal components

        if w_regions is None:  # If all regions have to be taken into account:

            def min_fun(x):
                """
                Function to be minimized for the fitting
                :param x: fitting parameters (offset, baseline, reference's linear factor, PCA scores)
                :return: squared norm of the difference between the apparent spectrum and its fitting
                """
                cc, mm, hh, g = x[0], x[1], x[2], x[3:]
                return np.linalg.norm(A_app -
                                      apparent_spectrum_fit_function_Bassan(
                                          wn, m_0, p_i, cc, mm, hh, g))**2.0
        else:  # If only the specified regions have to be taken into account:
            # Take the indexes of the specified regions
            w_indexes = []
            for pair in w_regions:
                min_pair = min(pair)
                max_pair = max(pair)
                ii1 = find_nearest_number_index(wn, min_pair)
                ii2 = find_nearest_number_index(wn, max_pair)
                w_indexes.extend(np.arange(ii1, ii2))
            p_i_w = np.copy(
                p_i[:, w_indexes]
            )  # Get the principal components of the extinction matrix at the

            # specified regions

            def min_fun(x):
                """
                Function to be minimized for the fitting
                :param x: fitting parameters (offset, baseline, reference's linear factor, PCA scores)
                :return: squared norm of the difference between the apparent spectrum and its fitting
                """
                cc, mm, hh, g = x[0], x[1], x[2], x[3:]
                return np.linalg.norm(
                    A_app_w - apparent_spectrum_fit_function_Bassan(
                        wn_w, m_0_w, p_i_w, cc, mm, hh, g))**2.0

        p0 = np.append([1.0, 0.0005, 0.9],
                       np.ones(n_components))  # Initial guess for the fitting
        res = scipy.optimize.minimize(min_fun, p0,
                                      method='Powell')  # Perform the fitting

        # print(res)  # Print the result of the minimization
        # assert(res.success) # Raise AssertionError if res.success == False

        c, m, h, g_i = res.x[0], res.x[1], res.x[2], res.x[
            3:]  # Take the fitted parameters

        Z_corr = (A_app - c - m * wn -
                  np.dot(g_i, p_i)) / h  # Apply the correction

        m_n = np.copy(
            Z_corr
        )  # Take the corrected spectrum as the reference for the next iteration

    return np.copy(
        Z_corr[::-1]
    )  # Return the corrected spectrum in inverted order for compatibility
Example #23
def Konevskikh(wavenumbers, App, m0, n_components=8, iterations=1):
    """
    Correct scattered spectra using Konevskikh algorithm
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of components
    :param iterations: number of iterations
    :return: corrected spectrum
    """
    # Copy the input variables
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)
    ii = np.argsort(wn)  # Sort the wavenumbers
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize parameters range:
    alpha_0, gamma = np.array([
        np.logspace(np.log10(0.1), np.log10(2.2), num=10) * 4.0e-4 * np.pi,
        np.logspace(np.log10(0.05e4), np.log10(0.05e5), num=10) * 1.0e-2
    ])
    p0 = np.ones(2 + n_components)
    Q_ext = np.zeros(
        (len(alpha_0) * len(gamma), len(wn)))  # Initialize extinction matrix

    m_n = np.copy(m_0)  # Copy the reference spectrum
    for n_iteration in range(iterations):
        ns_im = np.divide(
            m_n, wn)  # Compute the imaginary part of the refractive index
        # Compute the real part of the refractive index by Kramers-Kronig transform
        ns_re = -1.0 * np.imag(hilbert(ns_im))

        # Compute the extinction matrix
        n_index = 0
        for i in range(len(alpha_0)):
            for j in range(len(gamma)):
                for k in range(len(A_app)):
                    rho = alpha_0[i] * (1.0 + gamma[j] * ns_re[k]) * wn[k]
                    beta = np.arctan(ns_im[k] / (1.0 / gamma[j] + ns_re[k]))
                    Q_ext[n_index][k] = 2.0 - 4.0 * np.exp(-1.0 * rho * np.tan(beta)) * (np.cos(beta) / rho) * \
                        np.sin(rho - beta) - 4.0 * np.exp(-1.0 * rho * np.tan(beta)) * (np.cos(beta) / rho) ** 2.0 * \
                        np.cos(rho - 2.0 * beta) + 4.0 * (np.cos(beta) / rho) ** 2.0 * np.cos(2.0 * beta)
                    # TODO: rewrite this in a simpler way

                n_index += 1

        # Orthogonalize the extinction matrix with respect to the reference:
        for i in range(n_index):
            Q_ext[i][:] -= np.dot(Q_ext[i][:],
                                  m_0) / np.linalg.norm(m_0)**2.0 * m_0
        # Q_ext = GramSchmidt(np.copy(Q_ext))  # Apply Gram-Schmidt othogonalization to Q_ext (don't uncomment this)

        # Compute PCA of the extinction matrix
        pca = skl_decomposition.IncrementalPCA(n_components=n_components)
        pca.fit(Q_ext)
        p_i = pca.components_  # Get the principal components

        def min_fun(x):
            bb, cc, g = x[0], x[1], x[2:]
            return np.linalg.norm(
                A_app -
                apparent_spectrum_fit_function(wn, m_0, p_i, bb, cc, g))**2.0

        res = scipy.optimize.minimize(min_fun, p0, method='Powell')
        # print(res)  # Print the minimization results
        # assert(res.success) # Raise AssertionError if res.success == False

        b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Get the fitted parameters

        Z_corr = (A_app - c - np.dot(g_i, p_i)) / b  # Apply the correction

        m_n = np.copy(Z_corr)  # Update the reference with the correction

    return Z_corr[::-1]  # Return the corrected spectrum
Example #24
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding(X_projected, "Random Projection of the digits")

# ----------------------------------------------------------------------
# Projection on to the first 2 principal components
# PCA subtracts the mean, unlike SVD. If your features are least sensitive (informative)
# towards the mean of the distribution, then it makes sense to subtract the mean.
# If the features are most sensitive towards the high values, then subtracting the mean does not make sense.

print("Computing PCA projection")
t0 = time()
X_svd = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
X_pca = decomposition.PCA(n_components=2).fit_transform(
    X)  # centers but does not scale the data before applying SVD
X_ipca = decomposition.IncrementalPCA(n_components=2).fit_transform(
    X)  # for large datasets that do not fit in memory
plot_embedding(X_svd, "Singular value decomposition projection of the digits ")
plot_embedding(X_pca, "Principal Components projection of the digits ")
plot_embedding(X_ipca,
               "Incremental Principal Components projection of the digits ")

# ----------------------------------------------------------------------
# Projection on to the first 2 linear discriminant components

print("Computing Linear Discriminant Analysis projection")
X2 = X.copy()
X2.flat[::X.shape[1] +
        1] += 0.01  # Make X invertible. I don't think this is necessary.
t0 = time()
X_lda = discriminant_analysis.LinearDiscriminantAnalysis(
    n_components=2).fit_transform(X2, y)
Example #25
def normaliseDataset(path_to_raw_trainingset, path_to_save_loc, feature_type, chunksize):
	"""
	PCA + Normalise a provided dataset using pandas and SKLearn normalisation for rapid processing
	that scales linearly with input size.

	Pickles the normalisation object for future external test sets.

	--args
	path_to_raw_trainingset (str): path to the HDF5 file containing the raw training set
	path_to_save_loc (str): directory in which to save the processed data and pickled objects
	feature_type (str): describes which feature type is being processed
	chunksize (int): number of rows to read from the HDF5 file per chunk

	--returns
	None. The normalised, PCA-transformed training set is appended to an HDF5 store on disk,
	and the fitted StandardScaler and IncrementalPCA objects are pickled for future external
	test sets.

	"""
	scaler = preprocessing.StandardScaler()

	if feature_type == "1DCNN": # all parameters here were found manually:
		n_components = 200
		# print("This function takes ~10s to complete on 15K datapoints.\n")
	elif feature_type == "MOLPROPS":
		n_components = 750
		# print("This function takes ~10m to complete on 15K datapoints.\n")
	elif feature_type == "PFP":
		n_components = 200
		# print("This function takes ~10s to complete on 15K datapoints.\n")

	pca = decomposition.IncrementalPCA(n_components=n_components)
	

	###########################################################################################
	# we need to perform incremental standardization because of the large dataset:
	print("Making first pass (partial fitting)..")
	collection_iterable = readHDF5Iterable(
										path_to_raw_trainingset, 
										chunksize=chunksize)


	for collection in collection_iterable:

		# omit labels from normalisation:
		labels, collection_features = dropLabels(collection, feature_type)

		# fit the normalisation:
		scaler.partial_fit(collection_features)
		

	###########################################################################################
	# Now with fully updated means + variances, make a second pass through the iterable and transform:
	print("Making second pass (partial transform + partial PCA fit)..")
	collection_iterable = readHDF5Iterable(
									path_to_raw_trainingset, 
									chunksize=chunksize)

	for collection in collection_iterable:	
		# omit labels from normalisation:
		labels, collection_features = dropLabels(collection, feature_type)

		# transform:
		normalised_collection = pd.DataFrame(scaler.transform(collection_features))

		# now fit an incremental PCA to this chunk:
		pca.partial_fit(normalised_collection)
		
	
	# # uncomment to figure out ~ how many dims to retain for 95% VE.
	# # can't use n_components=0.95 in our case because we process in chunks :(

	# ve_ratios = pca.explained_variance_ratio_
	# ve_counter = 0
	# ve_cumulative = 0
	# for ve in ve_ratios:
	# 	if not ve_cumulative >= 0.95:			
	# 		ve_cumulative += ve
	# 		ve_counter += 1
	# print("Keep", ve_counter, "to retain", ve_cumulative*100, "of variance explained.")

	###########################################################################################
	# now with the completed PCA object; go over iterable one last time;
	# apply normalisation and transform by PCA and save to individual files:
	print("Making third pass (normalise and PCA transform)..")
	collection_iterable = readHDF5Iterable(
									path_to_raw_trainingset, 
									chunksize=chunksize)
	
	if os.path.exists(path_to_save_loc+feature_type+"/data.h5"):
		os.remove(path_to_save_loc+feature_type+"/data.h5")
	store = pd.HDFStore(path_to_save_loc+feature_type+"/data.h5")


	for collection in collection_iterable:
		
		# this is our final transform; save perturbation names:
		perturbation_indeces = collection.index

		# omit labels from normalisation:
		labels, collection_features = dropLabels(collection, feature_type)

		# normalise transform:
		normalised_collection = pd.DataFrame(scaler.transform(collection_features))

		# PCA transform to finish preprocessing:
		processed_collection = pca.transform(normalised_collection)

		# prettify the np matrix back into DF and append to HDF:
		num_PCA_dims = len(processed_collection[0])
		PCA_column_headers = [ "PC"+str(dim) for dim in range(num_PCA_dims)]

		pca_data_df = pd.DataFrame(
									processed_collection, 
									index=perturbation_indeces, 
									columns=PCA_column_headers
									)

		complete_preprocessed_df = pd.concat([pca_data_df, labels], 
												axis=1, sort=False)

		store.append(
					path_to_save_loc+feature_type+"/data.h5", 
					complete_preprocessed_df,
					format="table",
					index=False,
					)
	store.close()
	# finally, save both the standardscaler and PCA object for transforming test datasets:
	pickle.dump(scaler, open(path_to_save_loc+"PICKLES/"+feature_type+"_scaler.pkl","wb"))
	pickle.dump(pca, open(path_to_save_loc+"PICKLES/"+feature_type+"_pca.pkl","wb"))
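The commented-out block above estimates how many dimensions retain 95% of the explained variance; a cleaned-up sketch of the same idea (a hypothetical helper, not part of the original pipeline):

import numpy as np

def components_for_variance(fitted_pca, target=0.95):
    # cumulative explained-variance ratio of a fitted (Incremental)PCA object
    cumulative = np.cumsum(fitted_pca.explained_variance_ratio_)
    n_keep = int(np.searchsorted(cumulative, target)) + 1
    n_keep = min(n_keep, len(cumulative))  # target may exceed the total variance retained
    return n_keep, cumulative[n_keep - 1]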
Example #26
def correct_reference(m, wn, a, d, w_regions):
    """
    Correct reference spectrum as in Kohler's method
    :param m: reference spectrum
    :param wn: wavenumbers
    :param a: Average refractive index range
    :param d: Cell diameter range
    :param w_regions: Weighted regions
    :return: corrected reference spectrum
    """
    n_components = 6  # Set the number of principal components

    # Copy the input variables
    m = np.copy(m)
    wn = np.copy(wn)

    # Compute the alpha range:
    alpha = 4.0 * np.pi * 0.5 * np.linspace(
        np.min(d) * (np.min(a) - 1.0),
        np.max(d) * (np.max(a) - 1.0), 150)

    p0 = np.ones(1 + n_components)  # Initial guess for the fitting

    # Compute extinction matrix
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA to Q_ext
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Get the principal components of the extinction matrix

    # Get the weighted regions of the wavenumbers, the reference spectrum and the principal components
    w_indexes = []
    for pair in w_regions:
        min_pair = min(pair)
        max_pair = max(pair)
        ii1 = find_nearest_number_index(wn, min_pair)
        ii2 = find_nearest_number_index(wn, max_pair)
        w_indexes.extend(np.arange(ii1, ii2))
    wn_w = np.copy(wn[w_indexes])
    m_w = np.copy(m[w_indexes])
    p_i_w = np.copy(p_i[:, w_indexes])

    def min_fun(x):
        """
        Function to be minimized for the fitting
        :param x: offset and PCA scores
        :return: difference between the spectrum and its fitting
        """
        cc, g = x[0], x[1:]
        # Return the squared norm of the difference between the reference spectrum and its fitting:
        return np.linalg.norm(
            m_w - reference_spectrum_fit_function(wn_w, p_i_w, cc, g))**2.0

    # Perform the minimization using Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')

    c, g_i = res.x[0], res.x[1:]  # Obtain the fitted parameters

    # Apply the correction:
    m_corr = np.zeros(np.shape(m))
    for i in range(len(wn)):
        sum1 = 0
        for j in range(len(g_i)):
            sum1 += g_i[j] * p_i[j][i]
        m_corr[i] = (m[i] - c - sum1)

    return m_corr  # Return the corrected spectrum
Example #27
print(meta_df['pert_id'].nunique())
print(meta_df['perturbation'].nunique())
# meta_df.to_csv('data/metadata.tsv', sep='\t')

meta_df.to_csv('data/metadata-sig-only.tsv', sep='\t')

mask_shared = np.in1d(sig_ids, meta_df.index.tolist())

mat = mat[mask_shared, :]
# mat = pairwise_distances(mat, metric='cosine')
# z-score norm
print(mat.shape)

batch_size = 400
scl = preprocessing.StandardScaler()
ipca = decomposition.IncrementalPCA(n_components=3, batch_size=None)

n_batchs = int(math.ceil(mat.shape[0] / float(batch_size)))

# for i in range(n_batchs):
# 	start_idx = i * batch_size
# 	end_idx = (i+1) * batch_size

# 	scl.partial_fit(mat[start_idx:end_idx])

for i in range(n_batchs):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    # scaled_sub_mat = scl.transform(mat[start_idx:end_idx])
    # ipca.partial_fit(scaled_sub_mat)
    ipca.partial_fit(mat[start_idx:end_idx])
Example #28
def find_components(input_data, n_components=2, method='pca', **kwargs):
    '''
    Extract components from an array of data
    
    input_data: np.array
        The input data matrix
    
    n_components : int
        The number of components to extract
    
    method : str
        The dimensionality reduction technique to use
        
    kwargs : optional arguments to pass to the construction of the estimator
        
    Note: this function is basically a wrapper for a bunch of the 
    standard estimators found in sklearn. Please refer to the sklearn documentation
    for the keyword arguments to pass to the various estimators.
    http://scikit-learn.org/stable/modules/decomposition.html
    
    Examples
    --------
    
    >>>components = find_components(data,method='k-means',tol=1e-3, batch_size=100, max_iter=50)
    >>>plot(components.T)
    
    >>>components = find_components(copy(all_traces.T),method='incremental pca',batch_size=100)
    >>>plot(components.T)
    
    DEV:
     Automatically compute the batch sizes for incremental and minibatch PCA
    "The computational overhead of each SVD is O(batch_size * n_features ** 2), 
    but only 2 * batch_size samples remain in memory at a time. There will be n_samples / batch_size SVD 
    computations to get the principal components, versus 1 large SVD of complexity O(n_samples * n_features ** 2) 
    for PCA."
    
    Issues
    ------

    LDA returns components that are not orthogonal; need to apply Gram-Schmidt
    
    Kernel PCA is currently broken, also LDA is erratic
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA
    

    '''

    if not has_sklearn:
        warnings.warn(
            'scikit-learn not found. This function will not work correctly.')
        return None

    n_samples, n_features = input_data.shape
    rng = RandomState(0)

    if n_samples < n_features:
        warnings.warn(
            'More features than samples; assuming input data matrix was transposed.'
        )
        input_data = input_data.T
        n_samples, n_features = n_features, n_samples

    if method in [
            'pca', 'ica', 'k-means', 'incremental pca', 'kernel pca',
            'random pca'
    ]:
        data = input_data - input_data.mean(axis=0)
    else:
        data = input_data

    if method == 'pca':
        estimator = decomposition.PCA(n_components=n_components, **kwargs)
    elif method == 'k-means':
        estimator = MiniBatchKMeans(n_clusters=n_components,
                                    random_state=rng,
                                    **kwargs)
    elif method == 'ica':
        estimator = decomposition.FastICA(n_components=n_components,
                                          whiten=True,
                                          **kwargs)
    elif method == 'incremental pca':
        estimator = decomposition.IncrementalPCA(n_components=n_components,
                                                 whiten=True)
    elif method == 'svd':
        estimator = decomposition.TruncatedSVD(n_components=n_components,
                                               random_state=rng,
                                               **kwargs)
    elif method == 'kernel pca':
        estimator = decomposition.KernelPCA(n_components=n_components,
                                            **kwargs)
    elif method == 'lda':
        estimator = decomposition.LatentDirichletAllocation(
            n_components=n_components, random_state=rng, **kwargs)
        data = data + abs(min(ravel(data)))
    elif method == 'random pca':
        estimator = decomposition.PCA(n_components=n_components,
                                      svd_solver='randomized',
                                      whiten=True,
                                      **kwargs)
    else:
        warnings.warn('Unknown \'method\' argument given; falling back to PCA')
        estimator = decomposition.PCA(n_components=n_components)

    estimator.fit(data)

    if hasattr(estimator, 'cluster_centers_'):
        components = estimator.cluster_centers_
    else:
        components = estimator.components_

    return components
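A sketch of the batch-size heuristic mentioned in the DEV note of the docstring above (a hypothetical helper; sklearn's IncrementalPCA itself falls back to 5 * n_features when batch_size is None):

def suggest_ipca_batch_size(n_samples, n_features, features_multiple=5):
    # roughly 2 * batch_size samples stay in memory at a time, and each of the
    # ceil(n_samples / batch_size) SVDs costs O(batch_size * n_features ** 2)
    batch_size = min(max(features_multiple * n_features, 1), n_samples)
    n_svds = -(-n_samples // batch_size)
    return batch_size, n_svds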
Example #29
def Kohler(wavenumbers, App, m0, n_components=8):
    """
    Correct scattered spectra using Kohler's algorithm
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of principal components to be calculated 
    :return: corrected data
    """
    # Make copies of all input data:
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)
    ii = np.argsort(wn)  # Sort the wavenumbers from smallest to largest
    # Sort all the input variables accordingly
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize the alpha parameter:
    alpha = np.linspace(
        3.14, 49.95, 150) * 1.0e-4  # alpha = 2 * pi * d * (n - 1) * wavenumber
    p0 = np.ones(2 +
                 n_components)  # Initialize the initial guess for the fitting

    # # Initialize the extinction matrix:
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA of Q_ext:
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Extract the principal components

    # print(np.sum(pca.explained_variance_ratio_)*100)  # Print the explained variance ratio in percentage

    def min_fun(x):
        """
        Function to be minimized by the fitting
        :param x: array containing the reference linear factor, the offset, and the PCA scores 
        :return: function to be minimized
        """
        bb, cc, g = x[0], x[1], x[2:]
        # Return the squared norm of the difference between the apparent spectrum and the fit
        return np.linalg.norm(
            A_app -
            apparent_spectrum_fit_function(wn, m_0, p_i, bb, cc, g))**2.0

    # Minimize the function using Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')
    # print(res)  # Print the minimization result
    # assert(res.success) # Raise AssertionError if res.success == False

    b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Obtain the fitted parameters

    # Apply the correction to the apparent spectrum
    Z_corr = np.zeros(np.shape(m_0))
    for i in range(len(wavenumbers)):
        sum1 = 0
        for j in range(len(g_i)):
            sum1 += g_i[j] * p_i[j][i]
        Z_corr[i] = (A_app[i] - c - sum1) / b

    return Z_corr[::
                  -1]  # Return the correction in reverse order for compatibility
Example #30
def Kohler_zero(wavenumbers, App, w_regions, n_components=8):
    """
    Correct scattered spectra using Kohler's algorithm
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param w_regions: the regions to be taken into account for the fitting
    :param n_components: number of principal components to be calculated
    :return: corrected spectrum and the fitted baseline
    """
    # Make copies of all input data:
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.zeros(len(wn))
    ii = np.argsort(wn)  # Sort the wavenumbers from smallest to largest
    # Sort all the input variables accordingly
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize the alpha parameter:
    alpha = np.linspace(
        1.25, 49.95, 150) * 1.0e-4  # alpha = 2 * pi * d * (n - 1) * wavenumber
    p0 = np.ones(2 +
                 n_components)  # Initialize the initial guess for the fitting

    # # Initialize the extinction matrix:
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA of Q_ext:
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Extract the principal components

    # print(np.sum(pca.explained_variance_ratio_)*100)  # Print the explained variance ratio in percentage
    w_indexes = []
    for pair in w_regions:
        min_pair = min(pair)
        max_pair = max(pair)
        ii1 = find_nearest_number_index(wn, min_pair)
        ii2 = find_nearest_number_index(wn, max_pair)
        w_indexes.extend(np.arange(ii1, ii2))
    wn_w = np.copy(wn[w_indexes])
    A_app_w = np.copy(A_app[w_indexes])
    m_w = np.copy(m_0[w_indexes])
    p_i_w = np.copy(p_i[:, w_indexes])

    def min_fun(x):
        """
        Function to be minimized by the fitting
        :param x: array containing the reference linear factor, the offset, and the PCA scores
        :return: function to be minimized
        """
        bb, cc, g = x[0], x[1], x[2:]
        # Return the squared norm of the difference between the apparent spectrum and the fit
        return np.linalg.norm(
            A_app_w -
            apparent_spectrum_fit_function(wn_w, m_w, p_i_w, bb, cc, g))**2.0

    # Minimize the function using Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')
    # print(res)  # Print the minimization result
    # assert(res.success) # Raise AssertionError if res.success == False

    b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Obtain the fitted parameters

    # Apply the correction to the apparent spectrum
    Z_corr = (A_app - c - np.dot(g_i, p_i))  # Apply the correction
    base = np.dot(g_i, p_i)

    return Z_corr, base