Пример #1
0
def pcaPlot(filename):
    """Fit a two-component PCA on the training period and plot the projection.

    The scaler and the PCA are fitted on the training-period rows only and
    are then applied to every row of the cleaned data set. Training points
    are drawn in light blue; the remaining plotted points are coloured by
    their row order, with a colour bar.

    Note that the second plotted slice runs from the *end of the training
    period* to the end of the test period; the configured test start is not
    used.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.

    Returns:
        The fitted ``decomposition.PCA`` instance.
    """
    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, timestamps = getConfig(
        filename.split('/')[-2])

    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    traintime, testtime, validtime = timestamps

    if relevantColumns is not None:
        frame = utilities.dropIrrelevantColumns(
            frame, [relevantColumns, labelNames])

    start_train, end_train = traintime
    _test_start, end_test = testtime
    # The validation window is unpacked but plays no part in the plot.
    _valid_start, _valid_end = validtime

    # Fit scaling and PCA on the training window only.
    train_frame = utilities.getDataByTimeframe(frame, start_train, end_train)
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_frame.values)

    n_components = 2
    pca = decomposition.PCA(n_components=n_components)
    pca.fit(scaled_train)

    # Project every row with the transforms fitted above.
    projected = pca.transform(scaler.transform(frame.values))

    df_pca = pd.DataFrame(
        data=projected,
        index=frame.index,
        columns=['pca1', 'pca2'],
    )
    pca_train = utilities.getDataByTimeframe(df_pca, start_train, end_train)
    pca_test = utilities.getDataByTimeframe(df_pca, end_train, end_test)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('PCA 1', fontsize=10)
    ax.set_ylabel('PCA 2', fontsize=10)
    ax.set_title('PCA plot', fontsize=12)

    cmap = sns.cubehelix_palette(as_cmap=True)
    # Colour value per point: its position within the plotted slice.
    row_order = list(range(pca_test.shape[0]))

    ax.scatter(pca_train['pca1'], pca_train['pca2'], c='lightblue')
    points = ax.scatter(
        pca_test['pca1'],
        pca_test['pca2'],
        c=row_order,
        cmap=cmap,
        alpha=0.4,
    )
    fig.colorbar(points)
    plt.show()

    return pca
Пример #2
0
def main(filename):
    """Load a data file, clean it and pass it to ``analysis.pairplot``.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, _ = getConfig(filename.split('/')[-2])

    # Column filtering is skipped when the config lists no relevant columns.
    if relevantColumns is not None:
        frame = utilities.dropIrrelevantColumns(
            frame, [relevantColumns, labelNames])

    analysis.pairplot(frame)
def main(filename, start, end):
    """Print the cleaned data of a file restricted to a time window.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.
        start: First bound handed to ``utilities.getDataByTimeframe``.
        end: Second bound handed to ``utilities.getDataByTimeframe``.
    """
    cleaned = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()
    window = utilities.getDataByTimeframe(cleaned, start, end)

    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, _ = getConfig(filename.split('/')[-2])

    # Column filtering is skipped when the config lists no relevant columns.
    if relevantColumns is not None:
        window = utilities.dropIrrelevantColumns(
            window, [relevantColumns, labelNames])

    prints.printDataframe(window)
Пример #4
0
def main(filename, numberOfComponents):
    """Run ``analysis.pca`` on a data file and print the explained variance.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.
        numberOfComponents: Forwarded unchanged to ``analysis.pca``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, _ = getConfig(filename.split('/')[-2])

    # Column filtering is skipped when the config lists no relevant columns.
    if relevantColumns is not None:
        frame = utilities.dropIrrelevantColumns(
            frame, [relevantColumns, labelNames])

    prints.printEmptyLine()
    fitted = analysis.pca(
        frame, numberOfComponents, relevantColumns, labelNames)
    prints.printExplainedVarianceRatio(fitted)
Пример #5
0
def main(filename):
    """Compute and print the correlation matrix of a data file.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, _ = getConfig(filename.split('/')[-2])

    # Column filtering is skipped when the config lists no relevant columns.
    if relevantColumns is not None:
        frame = utilities.dropIrrelevantColumns(
            frame, [relevantColumns, labelNames])

    prints.printEmptyLine()

    matrix = analysis.correlationMatrix(frame)
    prints.printCorrelationMatrix(matrix, frame, labelNames)
Пример #6
0
def main(filename):
    """Split a data file into train/test parts and show their value distribution.

    Args:
        filename: Path to the data file, using '/' as separator. The name of
            the directory that contains the file is passed to ``getConfig``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    # Configuration is looked up by the file's parent directory name.
    _, relevantColumns, labelNames, _, timestamps = getConfig(
        filename.split('/')[-2])

    # Only the train and test windows are used; the third entry is ignored.
    traintime, testtime, _ = timestamps

    # Column filtering is skipped when the config lists no relevant columns.
    if relevantColumns is not None:
        frame = utilities.dropIrrelevantColumns(
            frame, [relevantColumns, labelNames])

    train_part, test_part = utilities.getTestTrainSplit(
        frame, traintime, testtime)

    analysis.valueDistribution(train_part, test_part)