def pcaPlot(filename):
    """Fit a two-component PCA on the training window and plot the projection.

    The configuration is fetched with ``getConfig`` using the second-to-last
    ``/``-separated component of ``filename`` (the containing directory name).
    The data is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers, and optionally reduced to the configured relevant
    and label columns.

    A ``StandardScaler`` and a ``decomposition.PCA`` are fitted on the rows
    that ``utilities.getDataByTimeframe`` returns for the training timestamps;
    both are then applied to the whole frame. The training-window points are
    drawn in light blue, and the points of the window from the training end to
    the test end are coloured by their row position with a cubehelix colormap.
    The figure is displayed with ``plt.show()``.

    Returns the fitted PCA object.
    """
    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, timestamps = getConfig(config_key)

    frame = utilities.getDataWithTimeIndex(utilities.readDataFile(filename))
    frame = frame.dropna()

    train_span, test_span, valid_span = timestamps
    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    # Only the training bounds and the test end are used below; the remaining
    # bounds are still unpacked so a malformed config fails the same way.
    train_begin, train_end = train_span
    _test_begin, test_end = test_span
    _valid_begin, _valid_end = valid_span

    # Scaler and PCA are fitted on the training window only.
    train_matrix = utilities.getDataByTimeframe(
        frame, train_begin, train_end).values
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_matrix)

    component_count = 2
    pca = decomposition.PCA(n_components=component_count)
    pca.fit(scaled_train)

    # Project the complete frame with the transforms fitted above.
    projected = pca.transform(scaler.transform(frame.values))
    pca_frame = pd.DataFrame(
        data=projected, index=frame.index, columns=['pca1', 'pca2'])

    pca_train = utilities.getDataByTimeframe(pca_frame, train_begin, train_end)
    # NOTE(review): this window starts at the training end, not at the
    # configured test start -- confirm that this is intentional.
    pca_test = utilities.getDataByTimeframe(pca_frame, train_end, test_end)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('PCA 1', fontsize=10)
    ax.set_ylabel('PCA 2', fontsize=10)
    ax.set_title('PCA plot', fontsize=12)

    palette = sns.cubehelix_palette(as_cmap=True)
    # Colour value of each later point is its row position (0, 1, 2, ...).
    row_positions = list(range(pca_test.shape[0]))

    ax.scatter(pca_train['pca1'], pca_train['pca2'], c='lightblue')
    coloured = ax.scatter(
        pca_test['pca1'],
        pca_test['pca2'],
        c=row_positions,
        cmap=palette,
        alpha=0.4,
    )
    fig.colorbar(coloured)

    plt.show()
    return pca
def main(filename):
    """Load the data in ``filename`` and hand it to ``analysis.pairplot``.

    The file is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers. The configuration is fetched with ``getConfig``
    using the second-to-last ``/``-separated component of ``filename``; when
    it names relevant columns, the frame is reduced to those and the label
    columns before plotting.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, _timestamps = getConfig(config_key)

    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    analysis.pairplot(frame)
def main(filename, start, end):
    """Print the rows of ``filename`` selected for ``start``..``end``.

    The file is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers, then restricted with
    ``utilities.getDataByTimeframe(df, start, end)``. The configuration is
    fetched with ``getConfig`` using the second-to-last ``/``-separated
    component of ``filename``; when it names relevant columns, the frame is
    reduced to those and the label columns. The result is printed with
    ``prints.printDataframe``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()
    # The timeframe is applied before the configuration is looked up.
    frame = utilities.getDataByTimeframe(frame, start, end)

    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, _timestamps = getConfig(config_key)

    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    prints.printDataframe(frame)
def main(filename, numberOfComponents):
    """Run ``analysis.pca`` on ``filename`` and print its explained variance.

    The file is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers. The configuration is fetched with ``getConfig``
    using the second-to-last ``/``-separated component of ``filename``; when
    it names relevant columns, the frame is reduced to those and the label
    columns. ``numberOfComponents`` is forwarded unchanged to
    ``analysis.pca`` and the returned object is passed to
    ``prints.printExplainedVarianceRatio``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, _timestamps = getConfig(config_key)

    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    prints.printEmptyLine()
    fitted = analysis.pca(frame, numberOfComponents, relevant, labels)
    prints.printExplainedVarianceRatio(fitted)
def main(filename):
    """Compute and print the correlation matrix for the data in ``filename``.

    The file is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers. The configuration is fetched with ``getConfig``
    using the second-to-last ``/``-separated component of ``filename``; when
    it names relevant columns, the frame is reduced to those and the label
    columns. The result of ``analysis.correlationMatrix`` is printed with
    ``prints.printCorrelationMatrix`` together with the frame and the
    configured label names.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, _timestamps = getConfig(config_key)

    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    prints.printEmptyLine()
    correlation = analysis.correlationMatrix(frame)
    prints.printCorrelationMatrix(correlation, frame, labels)
def main(filename):
    """Split ``filename`` into train/test parts and plot value distributions.

    The file is read, time-indexed and stripped of missing values through the
    ``utilities`` helpers. The configuration is fetched with ``getConfig``
    using the second-to-last ``/``-separated component of ``filename``; when
    it names relevant columns, the frame is reduced to those and the label
    columns. The frame is then split with ``utilities.getTestTrainSplit``
    using the configured train and test timestamps, and both parts are passed
    to ``analysis.valueDistribution``.
    """
    frame = utilities.getDataWithTimeIndex(
        utilities.readDataFile(filename)).dropna()

    config_key = filename.split('/')[-2]
    _columns, relevant, labels, _units, timestamps = getConfig(config_key)

    # The validation timestamps are unpacked but not used by this entry point.
    train_span, test_span, _valid_span = timestamps

    if relevant is not None:
        frame = utilities.dropIrrelevantColumns(frame, [relevant, labels])

    train_part, test_part = utilities.getTestTrainSplit(
        frame, train_span, test_span)
    analysis.valueDistribution(train_part, test_part)