print('')

# Convert categorical features to numerical.
# Deep-copy the original data.
df_encoded = df.copy(deep=True)

# Use scikit-learn label encoding to encode character data.
le = preprocessing.LabelEncoder()
for col in categoricalCol:
    df_encoded[col] = le.fit_transform(df[col])
    le_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print('Feature: ', col)
    print(le_mapping)

# Drop columns.
drop_col = ['DailyRate', 'EmployeeCount', 'EmployeeNumber', 'MonthlyRate', 'Over18']
df_encoded = df_encoded.drop(drop_col, axis=1)
print(df_encoded.head())

pd.set_option('display.max_columns', 10)
print(df_encoded.describe().transpose())
print('')

# Check for missing values.
print(df_encoded.isnull().sum())

# Correlation heatmap of the encoded features.
df_corr = df_encoded.corr()
plt.figure(figsize=(8, 8))
corrplot(df_corr, size_scale=300)
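# A note on the encoding loop above: a single LabelEncoder instance is re-fit
# for every column, so only the last column's fit survives the loop (the
# per-iteration printouts are still correct). A minimal sketch of a variant
# that keeps one fitted encoder per column so mappings can be inverted later;
# the `encoders` dict is an illustrative addition, not the original code:
encoders = {}
for col in categoricalCol:
    le = preprocessing.LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col])
    encoders[col] = le  # later: encoders[col].inverse_transform([0, 1])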
"cwl2(lr:0.015)":'{} ({})'.format(r'$CW\_l_2$', r'$lr: 0.015$'), "dfl2(os:3/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 3$'), "dfl2(os:8/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 8$'), "dfl2(os:20/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 20$'), "fgsm(eps:0.1)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.1$'), "fgsm(eps:0.2)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.2$'), "fgsm(eps:0.3)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.3$'), "jsma(theta:0.15)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.15$'), "jsma(theta:0.18)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.18$'), "jsma(theta:0.21)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.21$'), "mim(eps:0.05)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.05$'), "mim(eps:0.075)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.075$'), "mim(eps:0.1)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.1$'), "onepixel(pxCnt:5)":'{} ({})'.format(r'$OP$', r'$px~count: 5$'), "onepixel(pxCnt:15)":'{} ({})'.format(r'$OP$', r'$px~count: 15$'), "onepixel(pxCnt:30)":'{} ({})'.format(r'$OP$', r'$px~count: 30$'), "pgd(eps:0.075)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.075$'), "pgd(eps:0.09)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.09$'), "pgd(eps:0.1)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.1$'), "BS":'Benign Samples', } rank_corr=rank_corr.rename(columns=titles_for_attacks) rank_corr=rank_corr.rename(index=titles_for_attacks) filepath=os.path.join(resultDir, "rank_correlation_plot.pdf") plt.figure(figsize=(50, 50)) corrplot(rank_corr, filepath, size_scale=36, palette=sns.diverging_palette(5, 250, n=256))
print(DF.tail(20))

DF = DF.drop(columns=['Element1', 'Element2'])
DF = DF.sort_values(by='Class')
# DF["name"] = DF["compound"] + '_group' + DF["Class"].astype(str)
# DF = DF[['name', 'Pauling EN', 'Sum of Valence e-', 'Mean atomic number',
#          'Mean atomic radius', 'Atomic radius ratio',
#          'Group Number difference', 'Quantum number difference']]
# print(DF.tail(20))
# DF.to_csv('np.txt', sep=' ', index=False, header=True)

# Plot a correlation heatmap for some features.
corr = DF.iloc[:, np.arange(3, 58, 1)].corr()
plt.figure(figsize=(30, 30))
heatmap.corrplot(corr)
plt.savefig('pairplot.png', dpi=400)
plt.show()

# Define features and target.
X = DF.iloc[:, list(range(5, 55))]
y = DF.iloc[:, 3]

# 70/30 train/test stratified split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=20, stratify=y)

# Standardize features.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
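# The fragment stops right after constructing the scaler. A minimal sketch of
# the conventional next step, assuming the usual fit-on-train / transform-both
# pattern (not part of the original snippet):
X_train_std = sc.fit_transform(X_train)  # learn mean/std on the training split only
X_test_std = sc.transform(X_test)        # reuse them, so the test set never leaks in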
def corr_heatmap(data, figsize=(10, 10)):
    # medium/Better_Heatmaps_and_Correlation_Matrix_Plots_in_Python
    plt.figure(figsize=figsize)
    corrplot(data.corr())
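# A quick usage example for the helper above; `df` stands in for a numeric
# DataFrame and is a hypothetical name, not defined in the original:
corr_heatmap(df, figsize=(12, 12))
plt.show()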
# Total hours of data collection

# In[14]:

round(max(df["Time"]) / (60 * 60), 1)

# Creating the correlation plot

# In[15]:

plt.figure(figsize=(14, 14))
corrplot(df.corr())
plt.grid(False)

# Distribution of payment amounts: non-fraudulent transactions on the left,
# all transactions up to 250 on the right

# In[16]:

fig, ax = plt.subplots(1, 2, figsize=(14, 4))
sns.distplot(df.Amount[df["Class"] == 0], bins=100, kde=False,
             hist_kws={"color": "#3f8094", "linewidth": 0.4, "alpha": 1},
             ax=ax[0])
sns.distplot(df.Amount[df["Amount"] <= 250], bins=100, kde=False,
             hist_kws={"color": "#3f8094", "linewidth": 0.4, "alpha": 1},
             ax=ax[1])
ds.describe()

# View all possible variable correlations with Salary.
correlation = ds.corr().sort_values(by='2018_2019_Salary', ascending=False)
correlation['2018_2019_Salary']

"""Variable Correlation Plot"""

# View all possible variable correlations with Salary.
correlation = ds.corr(method='pearson', min_periods=1)
rs_val = correlation**2

# Generate a heatmap of Pearson correlation values.
plt.figure(figsize=(12, 12))
corrplot(correlation, size_scale=300)
plt.title("Heatmap 1 – Pearson Correlation", x=-8, y=1)
plt.show()

"""Top 8 variables"""

# Find the top 8 variables with the highest correlation with Salary.
# nlargest includes Salary itself, so request 9 rows to get 8 other variables.
num_vals = 9
larg = rs_val.nlargest(num_vals, '2018_2019_Salary')['2018_2019_Salary']
c = larg.index
csquared_val = ds[c].corr()**2

# Generate a heatmap of the top 8 features correlated with Salary.
f, ax = plt.subplots(figsize=(12, 12))
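# The fragment ends before the second heatmap is drawn on the axes it just
# created. One plausible completion using seaborn's heatmap on the squared
# correlations; this is an assumption, not the original code:
sns.heatmap(csquared_val, annot=True, square=True, ax=ax)
plt.show()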
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from heatmap import heatmap, corrplot

data = pd.read_csv('met-data.csv')

corr = data.corr()
# Unpivot the dataframe so we get a pair of arrays for x and y.
corr = pd.melt(corr.reset_index(), id_vars='index')
corr.columns = ['x', 'y', 'value']
heatmap(x=corr['x'], y=corr['y'], size=corr['value'].abs())

plt.figure(figsize=(10, 10))
corrplot(data.corr())
plt.show()
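# Size alone encodes only the magnitude of each correlation. The same
# long-form frame can also drive color; assuming the installed heatmap
# supports the `color` keyword described in the article these snippets
# follow (worth verifying against your version), a variant might look like:
heatmap(
    x=corr['x'], y=corr['y'],
    size=corr['value'].abs(),  # dot size encodes |correlation|
    color=corr['value'],       # dot color encodes the sign
)
plt.show()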
dataset = pd.read_csv('cov19_clean.csv')

# Visualize the data.
dataset.info()
dataset.describe()
dataset.head()

# Correlation matrix.
dataset.iloc[:, :-1].corrwith(dataset.InfectRate).plot.line()
plt.xticks(ticks=range(len(dataset.columns) - 1),
           labels=['population_density', 'median_age', 'aged_65_older',
                   'aged_70_older', 'gdp_per_capita', 'cvd_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'male_smokers',
                   'handwashing_facilities', 'hospital_beds_per_100k'],
           rotation=45)
plt.title('Variables correlating with the Infection Rate')
corrplot(dataset.corr(), size_scale=500, marker='s')

# Data preprocessing.
column_names_x = pd.DataFrame(dataset.iloc[:, 1:-1].columns.values)
x = pd.DataFrame(dataset.iloc[:, 1:-1].values)
y = dataset.iloc[:, -1].values
x.columns = column_names_x.values.ravel()  # flatten to a 1-D index

# Feature scaling is not necessary for linear regression here.

# Split into training and testing sets.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

# Apply linear regression.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
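# The fragment ends before the regressor is trained. The conventional
# continuation, sketched here as an assumption rather than the original code:
regressor.fit(x_train, y_train)                      # fit on the training split
y_pred = regressor.predict(x_test)                   # predict held-out infection rates
print('Test R^2:', regressor.score(x_test, y_test))  # coefficient of determination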
for i in range(1, len(new_helix)):
    maxsort = np.append(maxsort, np.argmax(new_helix[i]))

# Pairwise histogram intersection between hair_d entries (skipping index 0);
# the matrix is symmetric with a unit diagonal.
mat = np.zeros((len(hair_d.array) - 1, len(hair_d.array) - 1))
for i in range(1, len(hair_d.array)):
    for j in range(1, i):
        mat[i - 1, j - 1] = histogram_intersection(hair_d.array[i], hair_d.array[j])
        mat[j - 1, i - 1] = mat[i - 1, j - 1]
    mat[i - 1, i - 1] = 1
mat[0, 0] = 1

plt.imshow(mat)
plt.show()

df = pd.DataFrame(mat)
corrplot(df)
plt.show()

# Same similarity matrix, this time over the columns of new_helix.
mat = np.zeros((len(new_helix.T), len(new_helix.T)))
for i in range(0, len(new_helix.T)):
    for j in range(0, i):
        mat[i, j] = histogram_intersection(new_helix.T[i], new_helix.T[j])
        mat[j, i] = mat[i, j]
    mat[i, i] = 1
mat[0, 0] = 1

plt.imshow(mat)
plt.show()

df = pd.DataFrame(mat)
corrplot(df)  # , segid=i+1)
plt.show()
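# histogram_intersection is not defined in this fragment. A common definition,
# given here as an assumption about the missing helper: the histogram
# intersection kernel, i.e. the sum of element-wise minima of two histograms
# (which equals 1 for identical normalized histograms):
def histogram_intersection(h1, h2):
    return np.minimum(h1, h2).sum()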
layout = go.Layout(
    margin=dict(r=0, l=210, t=25, b=210),
    yaxis=dict(tickfont=dict(size=9)),
    xaxis=dict(tickfont=dict(size=9)))

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec
# Correlation matrix with heatmapz
from heatmap import heatmap, corrplot

data = pd.read_csv("telco_customer_churn.csv")
plt.figure(figsize=(8, 8))
corrplot(data.corr(), size_scale=300)

# Contract information (month-to-month, one year, and two year contracts).
ax1 = sns.distplot(data[data["Contract"] == "Month-to-month"]["tenure"],
                   hist=True, kde=False, bins=int(180 / 5),
                   hist_kws={'edgecolor': 'black'},
                   kde_kws={'linewidth': 4})
ax1.set_ylabel('Number of Customers')
ax1.set_xlabel('Tenure (months)')
ax1.set_title('Month-to-month Contract')

ax2 = sns.distplot(data[data["Contract"] == "One year"]["tenure"],
                   hist=True, kde=False, bins=int(180 / 5),
                   hist_kws={'edgecolor': 'black'},
                   kde_kws={'linewidth': 4})  # fragment truncated here; styling assumed to mirror ax1