def similarityWithUSE(messages, countries):
    """Encode *messages* and print/plot their pairwise similarity matrix.

    `embed` is a module-level encoder (presumably the Universal Sentence
    Encoder, given the function name — confirm where `embed` is defined).
    One row of the similarity matrix is printed per entry of *countries*,
    then the full matrix is handed to plots.heatmap.

    :param messages: sequence of strings to encode
    :param countries: row/column labels, one per message
    """
    # Fix: the original passed shape=(None), which is just None (i.e. a
    # fully unknown shape); [None] states the intended 1-D batch of strings.
    similarity_input_placeholder = tf.placeholder(tf.string, shape=[None])
    similarity_message_encodings = embed(similarity_input_placeholder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        message_embeddings_ = session.run(
            similarity_message_encodings,
            feed_dict={similarity_input_placeholder: messages})
        # Inner products of the embeddings, treated as correlations below —
        # NOTE(review): assumes the encoder outputs are normalized; confirm.
        corr = np.inner(message_embeddings_, message_embeddings_)
        print("correlations")
        for i, country in enumerate(countries):
            print(country, corr[i])
        plots.heatmap(countries, countries, corr)
def plot():
    """Load similarity.npy, symmetrize it, and render a 100x100 heatmap."""
    import matplotlib.pyplot as plt
    similarity_matrix = np.load("similarity.npy")
    # Symmetrize: wherever an entry is zero but its transposed counterpart is
    # non-zero, copy the mirrored value in.  This replaces the original
    # element-wise np.nditer loop with one vectorized pass — same result
    # (zeros stay zero when both sides are zero; NaNs are untouched in both
    # versions), but C-speed instead of a Python-level double loop.
    mask = (similarity_matrix == 0) & (similarity_matrix.T != 0)
    similarity_matrix[mask] = similarity_matrix.T[mask]
    # Only visualize the top-left 100x100 corner.
    similarity_matrix = similarity_matrix[:100, :100]
    row_labels = list(range(similarity_matrix.shape[0]))
    column_labels = list(range(similarity_matrix.shape[1]))
    cmap = ListedColormap(COLOR_MAP)
    plots.heatmap(similarity_matrix, row_labels, column_labels, plt, cmap=cmap)
    plt.gcf().set_size_inches(100, 100)
    plt.legend()
    plt.savefig("test1.svg")
    plt.show()
def print_cluster_data(centers, labels, true_labels):
    """Build and plot a cluster-vs-class contingency heatmap.

    centermap[i, j] counts the samples assigned to cluster i
    (labels == i) whose ground-truth class is the j-th unique value of
    true_labels.  labels and true_labels are assumed to be NumPy arrays
    (the original's `np.where(labels == i)` made the same assumption).

    :param centers: cluster centers; only len(centers) is used
    :param labels: predicted cluster index per sample
    :param true_labels: ground-truth class per sample
    """
    n_centers = len(centers)
    # Hoisted: the original recomputed np.unique(true_labels) on every use.
    classes = np.unique(true_labels)
    center_labels = ['Center %i' % i for i in range(n_centers)]
    class_labels = ['Class %i' % c for c in classes]
    centermap = np.zeros((n_centers, len(classes)))
    for i in range(n_centers):
        in_cluster = labels == i
        for j, cls in enumerate(classes):
            # Equivalent to the original np.where + np.intersect1d of the two
            # index sets, without materializing the index arrays.
            centermap[i, j] = np.count_nonzero(in_cluster & (true_labels == cls))
    # Plot the heatmap
    fig, ax = plt.subplots()
    im = plots.heatmap(centermap, center_labels, class_labels, ax=ax)
    texts = plots.annotate_heatmap(im, valfmt="{x: .0f}")
    fig.tight_layout()
    plt.show()
save=True, savepath='.\\png\\plots\\histogram\\' + datasetname + '.png', close=True)
# NOTE(review): the line above completes a plots.* call whose opening is not
# visible in this chunk — presumably plots.histogram(df, ...); confirm upstream.
# Each call below saves its figure under .\png\plots\<kind>\<datasetname>.png
# and closes it (close=True) so figures do not accumulate.
plots.boxplot(df, save=True,
              savepath='.\\png\\plots\\boxplot\\' + datasetname + '.png',
              close=True)
plots.scattermatrix(df, save=True,
                    savepath='.\\png\\plots\\scattermatrix\\' + datasetname + '.png',
                    close=True)
plots.heatmap(df, save=True,
              savepath='.\\png\\plots\\heatmap\\' + datasetname + '.png',
              close=True)
plots.probplot(df, save=True,
               savepath='.\\png\\plots\\probplot\\' + datasetname + '.png',
               close=True)
# Display any figures still open.
plt.show()
def _banner(title: str) -> None:
    """Print *title* framed by dashed separator lines (EDA section header)."""
    sep = '----------------------------------------------------------------------'
    print(sep)
    # NOTE(review): indent prefix assumed to be four spaces — confirm against
    # the original output formatting.
    print('{0}{1}'.format('    ', title))
    print(sep)


def eda(filepath: str,
        features=None,
        targets=None,
        removeOutliers: bool = False,
        datasetname: str = ''):
    """Run exploratory data analysis on a pickled DataFrame.

    Loads the DataFrame, prints shape/columns/head/tail/summary, standardizes
    the numeric columns, optionally removes z-score outliers iteratively, and
    saves a suite of plots (boxplot, histogram, scattermatrix, heatmap).

    :param filepath: path to a pickled DataFrame
    :param features: feature column names; inferred as all non-target columns
        when None (in which case *targets* must be iterable)
    :param targets: target column names
    :param removeOutliers: when True, iteratively drop rows with any numeric
        |z-score| > 3 until no outliers remain
    :param datasetname: used to build the plot output directory
    :return: the (standardized, possibly outlier-filtered) DataFrame
    """
    # load the data; the original leaked the file handle via
    # pk.load(open(filepath, 'rb')) — close it deterministically instead
    with open(filepath, 'rb') as fh:
        df = pk.load(fh)

    # process inputs
    # TODO: properly infer if features or targets are a sequence or a single string
    if features is None:
        features = list(set(df.columns) - set(targets))

    # examine the data
    _banner('Shape of dataset:')
    print('{0}Number of Rows: {1}'.format('    ', df.shape[0]))
    print('{0}Number of Columns: {1}'.format('    ', df.shape[1]))
    print('', end='\n\n\n')

    _banner('Column names:')
    for col in df.columns:
        print('{0}{1}'.format('    ', col))
    print('', end='\n\n\n')

    _banner('First 10 rows:')
    print(df.head(10))
    print('', end='\n\n\n')

    _banner('Last 10 rows:')
    print(df.tail(10))
    print('', end='\n\n\n')

    _banner('Statistical Summary:')
    print(df.describe())
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # infer data types of the input DataFrame
    # ----------------------------------------------------------------------
    colNumeric = dfutl.numericColumns(df)

    # ----------------------------------------------------------------------
    # mean centering and scaling: standardize or normalize
    # ----------------------------------------------------------------------
    dfNumeric = df.loc[:, colNumeric]
    df.loc[:, colNumeric] = (dfNumeric - dfNumeric.mean()) / dfNumeric.std()
    dfNumeric = df.loc[:, colNumeric]

    # ----------------------------------------------------------------------
    # outlier detection
    # ----------------------------------------------------------------------
    # use z-score filtering: samples more than 3 standard deviations from the
    # mean are discarded (only when removeOutliers is True)
    _banner('Outlier Detection:')
    numouttotal = 0
    numout = 1
    passNum = 0
    while numout > 0:
        # determine the number of outliers using zscore
        zscores = stats.zscore(dfNumeric)
        # keep rows where every numeric z-score is within [-3, 3]; written
        # with explicit comparisons (not abs) so NaN z-scores are kept,
        # matching the original logic
        idx = np.logical_not(np.logical_or(zscores < -3, zscores > 3))
        idxrows = np.all(idx, axis=1)
        numout = len(idxrows) - len(idxrows[idxrows])
        print('{0}Pass {1} detected {2} outliers'.format('    ', passNum, numout))
        if not removeOutliers:
            break

        # remove outliers and continue; removeOutliers is necessarily True
        # here (the loop broke above otherwise)
        if numout > 0:
            df = df.loc[idxrows, :]
            dfNumeric = df.loc[:, colNumeric]
        numouttotal = numouttotal + numout
        passNum = passNum + 1

    if removeOutliers:
        print('{0}Total number of outliers: {1}'.format('    ', numouttotal))
        print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # visualization
    # ----------------------------------------------------------------------
    plt.close('all')
    save = True
    if len(datasetname) > 0:
        savepath = '.\\png\\{0}\\eda\\'.format(datasetname)
        if not os.path.isdir(savepath):
            os.makedirs(savepath)
    else:
        savepath = '.\\png\\'
    plots.boxplot(dfNumeric, save=save, savepath=savepath)
    plots.histogram(df, tightLayout=True, save=save, savepath=savepath)
    plots.scattermatrix(dfNumeric, save=save, savepath=savepath)
    plots.heatmap(dfNumeric, correlation=0.5, save=save, savepath=savepath)
    #plt.show()
    plt.close('all')

    return df
plots.pairplot(stock_returns, 'stock_returns') # Distplot of MS Return in 2015 plots.distplot(returns_2015['MS Return'], bins=100, name='return_2015') # Distplot of C Return in 2008 plots.distplot(returns_C_2008, bins=100, name='returns_C_2008.png') # Lineplot for stock values plots.multi_lineplot(stock_close_reset, tickers=tickers) # Moving Averages plots.moving_average(stock_close_bac, key='BAC') # Heatmap and Clustermap with Close Price correlation plots.heatmap(close_price_corr) plots.clustermap(close_price_corr) # Create a candlestick plot for Bank of America plots.candlestick(bac_2015, 'bac_2015') # Create a candlestick plot with Bollinger Band plots.candlestick_boll(bac_2015, 'bac_2015_boll')
# Write a suite of exploratory plots for the DataFrame to .\visual\.
# stemleaf emits a text file; the others save PNG images and close the
# figure afterwards (close=True) so figures do not accumulate.
plots.stemleaf(df, title='Stem and Leaf', save=True,
               savepath='.\\visual\\iris_stemleaf.txt')
plots.histogram(df, save=True,
                savepath='.\\visual\\iris_histogram.png', close=True)
plots.boxplot(df, save=True,
              savepath='.\\visual\\iris_boxplot.png', close=True)
plots.scattermatrix(df, save=True,
                    savepath='.\\visual\\iris_scattermatrix.png', close=True)
plots.heatmap(df, save=True,
              savepath='.\\visual\\iris_heatmap.png', close=True)
plots.probplot(df, save=True,
               savepath='.\\visual\\iris_probplot.png', close=True)