Exemplo n.º 1
0
def plotit(what):
    """Plot the module-level ``Y_C`` array against one measurement array.

    ``what`` selects the second column: ``"DHI"`` uses the module-level
    ``DHI`` array, any other value uses ``D13C``. The stacked data and the
    ``country_codes`` labels are passed through ``dropna`` and then
    ``drop_lowNs`` (threshold 5) before the category frequencies are
    reported and the plot is drawn. ``what`` is also used as the label of
    the second axis.
    """
    measurement = DHI if what == "DHI" else D13C
    stacked = np.stack((Y_C, measurement), axis=1)
    points, labels = dropna(stacked, country_codes)
    points, labels = drop_lowNs(points, labels, threshold=5)
    category_frequencies(labels)
    plot(points, labels, axlabels=["Y", what])
Exemplo n.º 2
0
def xperiment():
    """Run the pairwise T2 / MANOVA / LDA analysis for the module-level ``FEATURE``.

    Steps, in order:
      1. Pull the merged data for ``FEATURE`` and drop incomplete rows.
      2. Report category frequencies, then filter with ``drop_lowNs``
         (first argument 10).
      3. Report the correlation of the ``PARAM`` columns and dump the
         pairwise T2 results under ``projectroot``.
      4. Run MANOVA, fit a 2-component LDA and show the transformed data
         as a split scatter plot.

    When there are more than five distinct labels the legend is switched
    off and the plot uses centers and labels instead.
    """
    # NOTE(review): assumes pull_merged_data returns a pandas DataFrame - confirm.
    df = pull_merged_data(feature=FEATURE).dropna()
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` returns the same ndarray and exists on every pandas version.
    X, Y = df[PARAM].values, df[FEATURE].values
    inspection.category_frequencies(Y)
    Y, X = drop_lowNs(10, Y, X)
    inspection.correlation(X, names=PARAM)
    pairwise_T2(X, Y, dumproot=projectroot, xpid=f"PairwiseT2_{FEATURE}.xlsx")
    F, p = manova(X, Y)
    print("-"*50)
    lda = LDA(n_components=2).fit(X, Y)  # type: LDA
    smexvar = lda.explained_variance_ratio_
    scat = scatter.Scatter2D(lda.transform(X), Y, title=f"LDA ({smexvar.sum():.2%})\nMANOVA: F = {F:.4f}, p = {p:.4f}",
                             axlabels=[f"Latent0{i} ({ev:.2%})" for i, ev in enumerate(smexvar, start=1)])
    # Too many classes make a legend unreadable; mark class centers instead.
    is_many = len(np.unique(Y)) > 5
    scat.split_scatter(legend=not is_many, show=True, center=is_many, label=is_many)
Exemplo n.º 3
0
Arquivo: area51.py Projeto: csxeba/EBH
def inspect_classes():
    """Print and plot descriptive statistics of the loaded dataset.

    Reports category frequencies of the labels, the correlation and the
    normality tests of the data under the names ``Y0..Y9, P0..P9``, then
    draws one ``fullplot`` per (name, entry of ``X.T``) pair.
    """
    from csxdata.stats import normaltest, inspection
    from csxdata.visual.histogram import fullplot

    # "Y0".."Y9" followed by "P0".."P9"
    names = [letter + str(index) for letter in "YP" for index in range(10)]

    X, Y = load_dataset(as_matrix=False, as_string=True)

    inspection.category_frequencies(Y)
    inspection.correlation(X, names=names)
    normaltest.full(X, names=names)

    for column, name in zip(X.T, names):
        fullplot(column, name)
Exemplo n.º 4
0
def main():
    """Inspect the merged "MEGYE" dataset and plot normality diagnostics.

    Prints the category frequencies of the ``"MEGYE"`` column and the
    normality tests of the ``PARAM`` columns, then, for each parameter
    paired with a row of a 3x2 axes grid, prints its skew and draws a
    histogram next to a normal probability plot.
    """
    df = pull_merged_data("MEGYE")
    print()
    inspection.category_frequencies(df["MEGYE"])
    normaltest.full(df[PARAM], names=PARAM)
    # inspection.correlation(df[PARAM], names=PARAM)
    print()

    _, axes_grid = plt.subplots(3, 2, figsize=(5, 10))

    # One grid row per parameter: histogram on the left, probability plot on the right.
    for param, axes_row in zip(PARAM, axes_grid):
        hist_ax, prob_ax = axes_row
        values = df[param]
        print(f"SKEW of {param}: {values.skew()}")
        hist = histogram.Histogram(values, ax=hist_ax)
        hist.plot(axtitle=f"{param} histogram")
        normprob = histogram.NormProb(values, ax=prob_ax)
        normprob.plot(axtitle=f"{param} Norm. prob. plot")

    plt.suptitle("Normality test on the merged fruit-wine datasets")
    plt.tight_layout()
    plt.show()
0
def xperiment(transform, ndim):
    """Project the "YEAR"-labelled dataset to ``ndim`` latent axes and plot it.

    Parameters:
        transform: name of the transformation; passed to
            ``get_transformator`` and upper-cased for the plot title.
        ndim: number of latent dimensions to plot, 2 or 3.

    Raises:
        ValueError: if ``ndim`` is neither 2 nor 3.

    MANOVA is computed on the raw data, the data is then standardized and
    transformed, and the result is shown as a split scatter plot whose title
    and axis labels carry the explained variance ratios.
    """
    # Reject an unsupported dimensionality before loading data and fitting
    # the model, instead of only after all the work has been done.
    if ndim not in (2, 3):
        raise ValueError(f"Unsupported dimensionality: {ndim}")

    X, Y = read_datasets(ycol="YEAR", dropthresh=10)
    category_frequencies(Y)
    F, p = manova(X, Y)
    X = standardize(X)
    model = get_transformator(ndim, transform)
    lX = model.fit_transform(X, Y)

    expvar = model.explained_variance_ratio_[:ndim]
    plottitle = f"{transform.upper()} ({sum(expvar):.2%})\nMANOVA F = {F:.4f}, p = {p:.4f}"
    axlabels = [f"Latent0{i+1} ({expvar[i]:.2%})" for i in range(ndim)]

    if ndim == 2:
        scat = Scatter2D(lX, Y, title=plottitle, axlabels=axlabels)
    else:
        scat = Scatter3D(lX, Y, title=plottitle, axlabels=axlabels)

    scat.split_scatter(legend=True, show=True)
Exemplo n.º 6
0
import numpy as np

from csxdata.stats.inspection import category_frequencies
from csxdata.utilities.highlevel import plot
from csxdata.utilities.parser import parse_csv
from csxdata.utilities.vectorop import drop_lowNs, dropna

from SciProjects.sophie import projectroot

# Plot the two measurement columns of 01GEO.csv together with a numeric
# coordinate and report how often each category occurs.
# NOTE(review): parse_csv presumably returns (data, label columns, header)
# with the first `indeps` columns treated as labels - confirm.
X, Y, head = parse_csv(projectroot + "01GEO.csv",
                       indeps=4, headers=1, decimal=True)

# The last label column is converted to float and used as a coordinate;
# the first label column is the category.
y_coord = Y[:, -1].astype(float)
categ = Y[:, 0]
# Unpacking X.T this way requires X to have exactly two columns.
DHI, D13C = X.T
plotme = np.stack((DHI, D13C, y_coord), axis=1)
# NOTE(review): dropna presumably removes incomplete rows from both arrays
# in lockstep - confirm.
plotme, categ = dropna(plotme, categ)
category_frequencies(categ)
plot(plotme, axlabels=["DHI", "D13C", "Y"])
Exemplo n.º 7
0
from SciProjects.sophie import projectroot

from csxdata.utilities.parser import parse_csv
from csxdata.utilities.vectorop import dropna
from csxdata.stats.inspection import category_frequencies, correlation
from csxdata.stats.normaltest import full

# Descriptive statistics for 01GEO.csv: category frequencies, correlation
# and normality tests.
# NOTE(review): parse_csv presumably returns (data, label columns, header)
# with the first `indeps` columns treated as labels - confirm.
X, Y, head = parse_csv(projectroot + "01GEO.csv",
                       indeps=2,
                       headers=1,
                       decimal=True)

# Frequencies are reported on the labels as parsed, before dropna is applied.
category_frequencies(Y)
X, Y = dropna(X, Y)
# Four names are passed along with X - presumably one per column of X; confirm.
correlation(X, ["X", "Y", "DH1", "DH2"])
full(X)
Exemplo n.º 8
0
def category_frequencies():
    """Report the category frequencies of the module-level ``Y``.

    Delegates to ``csxdata.stats.inspection.category_frequencies``; the
    import is done inside the function.
    """
    from csxdata.stats import inspection
    inspection.category_frequencies(Y)
Exemplo n.º 9
0
import numpy as np

from csxdata.utilities.highlevel import plot
from csxdata.stats.inspection import category_frequencies

from SciProjects.zsindstat.util import pull_data, axlab_latex


def filter_out(X, Y, unwanted):
    """Return ``X`` and ``Y`` restricted to the positions where ``Y != unwanted``.

    The positions are computed once from ``Y`` and applied to both arrays,
    so the two results stay aligned.
    """
    differs = Y != unwanted
    kept_positions = np.argwhere(differs).reshape(-1)
    kept_X = X[kept_positions]
    kept_Y = Y[kept_positions]
    return kept_X, kept_Y


# Pull the "FRUIT"-labelled data restricted to FAM == "Pru", report the
# label frequencies and plot the data.
# NOTE(review): the exact filtering semantics of pull_data are not visible
# here - confirm against SciProjects.zsindstat.util.
frame = pull_data("FRUIT", filterby="FAM", selection="Pru")

category_frequencies(frame.indeps)

plot(frame.data, frame.indeps, ellipse_sigma=2, axlabels=axlab_latex)
Exemplo n.º 10
0
def plotit3d():
    """Plot the module-level ``Y_C``, ``DHI`` and ``D13C`` arrays together.

    The three arrays are stacked column-wise, passed through ``dropna``
    together with ``country_codes``, the category frequencies of the
    remaining labels are reported and the plot is drawn.
    """
    columns = (Y_C, DHI, D13C)
    stacked = np.stack(columns, axis=1)
    points, labels = dropna(stacked, country_codes)
    category_frequencies(labels)
    plot(points, labels, axlabels=["Y", "DHI", "D13C"])
Exemplo n.º 11
0
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from SciProjects.fruits.xperiment05.util import pull_data

from csxdata.visual import scatter
from csxdata.utilities.vectorop import drop_lowNs
from csxdata.stats import manova, inspection

# LDA projection of the DH1 / DH2 / D13C measurements, labelled by FEATURE.
FEATURE = "EV"

# NOTE(review): assumes pull_data returns a pandas DataFrame - confirm.
df = pull_data(FEATURE)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# ``.values`` returns the same ndarray and exists on every pandas version.
X = df[["DH1", "DH2", "D13C"]].values
Y = df[FEATURE].values

Y, X = drop_lowNs(15, Y, X)

inspection.category_frequencies(Y)

lX = LDA(n_components=2).fit_transform(X, Y)
# manova(X, Y) is unpacked straight into the two numeric slots of the title.
title = "LDA\nMANOVA F = {:.4f}, {:.4f}".format(*manova(X, Y))
scat = scatter.Scatter2D(lX,
                         Y,
                         title=title,
                         axlabels=[f"Latent0{i}" for i in range(1, 3)])
scat.split_scatter(show=True)
Exemplo n.º 12
0
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt

from csxdata.visual import Plotter2D
from csxdata.utilities.vectorop import dropna
from csxdata.stats.inspection import category_frequencies

from SciProjects.sophie import pull_data, axtitles

# Spearman correlation between DHI and Y_C for the 04GEO_eu.csv dataset,
# shown as a per-country scatter plot.
X_C, Y_C, DHI, D13C, CCode = pull_data("04GEO_eu.csv")

DHI, Y_C, CCode = dropna(DHI, Y_C, CCode)
category_frequencies(CCode)
R, p = stats.spearmanr(DHI, Y_C)

# Degree-1 least-squares fit of DHI against Y_C, wrapped as a polynomial object.
line = np.polyfit(Y_C, DHI, 1)
line = np.poly1d(line)

# The trailing space after 'nem' keeps the negated form as two words
# ("nem szignifikáns"); without it the title read "nemszignifikáns".
ttl = (
    "Korreláció $(D/H)_I$ és az egyenlítőtől való távolság között Európában",
    f"Spearman-korreláció: R = {R:.2f}, p = {p:.2f}, {('nem ' if p > 0.05 else '')}szignifikáns"
)
axttl = ["Egyenlítőtől való távolság", axtitles["DHI"]]

plotter = Plotter2D(plt.figure(),
                    np.stack((Y_C, DHI), axis=1),
                    CCode,
                    title="\n".join(ttl),
                    axlabels=axttl)
plotter.split_scatter(center=True, sigma=2, alpha=0.5)
Exemplo n.º 13
0
from sklearn.feature_selection import f_oneway

from csxdata.stats import inspection
from csxdata.utilities.highlevel import plot, transform

from SciProjects.zsindstat.util import pull_data, axlab_latex

# Plot the "meggy" fruit data labelled by "YEAR", reduce it to a single LDA
# factor and print the F statistic and p-value of that factor.

# The script calls f_classif, but only f_oneway was imported at the top,
# which raised NameError; bring the function actually used into scope here.
from sklearn.feature_selection import f_classif

frame = pull_data("YEAR", filterby="FRUIT", selection="meggy")

inspection.category_frequencies(frame.indeps)
X = frame.learning
plot(X, frame.indeps, axlabels=axlab_latex, ellipse_sigma=2)
tX = transform(X, factors=1, get_model=False, method="lda", y=frame.indeps)

# f_classif returns the pair (F, p-value), unpacked into the two placeholders.
print("F: {}, pval: {}".format(*f_classif(tX, frame.indeps)))