def build_matrix(data_id, data, cols):
    """
    Compute the Pearson correlation matrix of ``cols`` together with a code
    snippet template describing the same computation.

    :param data_id: identifier forwarded to ``build_code_export``
    :param data: dataframe containing (at least) the columns named in ``cols``
    :param cols: list of column names to correlate
    :return: tuple of (correlation matrix as a dataframe, code snippet string)
    """
    subset = data[cols]
    if subset.isnull().values.any():
        # Missing values are present, so use pandas' implementation. Correlate
        # only the requested columns so the result covers the same columns as
        # the null check above, the numpy branch below and the exported snippet.
        data = subset.corr(method="pearson")
        code = build_code_export(data_id)
        code.append(
            (
                "corr_cols = [\n"
                "\t'{corr_cols}'\n"
                "]\n"
                "corr_data = df[corr_cols]\n"
                "{str_encodings}"
                "corr_data = corr_data.corr(method='pearson')"
            )
        )
    else:
        # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
        # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
        data = np.corrcoef(subset.values, rowvar=False)
        data = pd.DataFrame(data, columns=cols, index=cols)
        code = build_code_export(
            data_id, imports="import numpy as np\nimport pandas as pd\n\n"
        )
        # NOTE(review): the snippet below wraps corr_cols in an extra list for
        # columns/index, unlike the DataFrame call above - confirm this is intended.
        code.append(
            (
                "corr_cols = [\n"
                "\t'{corr_cols}'\n"
                "]\n"
                "corr_data = df[corr_cols]\n"
                "{str_encodings}"
                "corr_data = np.corrcoef(corr_data.values, rowvar=False)\n"
                "corr_data = pd.DataFrame(corr_data, columns=[corr_cols], index=[corr_cols])"
            )
        )

    # The "{corr_cols}" / "{str_encodings}" placeholders are left unformatted
    # here; presumably the caller fills them in - verify against caller.
    code = "\n".join(code)
    return data, code
def build(self):
    """
    Run the selected analysis and package its output for the caller.

    :return: dict holding the final chart code, the active query, the dtypes
             looked up for ``self.data_id``, the selected column's dtype, the
             analysis type and every key returned by the analysis itself
    """
    import_lines = (
        "import numpy as np",
        "import pandas as pd",
        "import plotly.graph_objs as go",
    )
    # Header of the exported code: the imports followed by a blank line.
    header = build_code_export(
        self.data_id, imports="\n".join(import_lines) + "\n\n"
    )

    analysis_output, analysis_code = self.analysis.build(self)
    final_code = build_final_chart_code(header + analysis_code)
    dtypes = global_state.get_dtypes(self.data_id)

    # dict(...) rather than a literal: a key clash between the fixed keys and
    # the analysis output raises instead of being silently overwritten.
    return dict(
        code=final_code,
        query=self.query,
        cols=dtypes,
        dtype=self.dtype,
        chart_type=self.analysis_type,
        **analysis_output
    )
def __init__(self, data_id, req):
    """
    Load the data behind ``data_id``, pick the column to analyze and create
    the analysis object matching the requested (or defaulted) analysis type.

    :param data_id: identifier used to look up settings, data and code export
    :param req: request object from which the "type" and "col" arguments are read
    """
    self.data_id = data_id
    self.analysis_type = get_str_arg(req, "type")

    settings = global_state.get_settings(data_id) or {}
    self.query = build_query(data_id, settings.get("query"))
    frame = load_filterable_data(data_id, req, query=self.query)

    requested_col = get_str_arg(req, "col", "values")
    self.selected_col = find_selected_column(frame, requested_col)

    # Keep only the rows where the selected column is populated.
    missing = pd.isnull(frame[self.selected_col])
    self.data = frame[~missing]
    self.dtype = find_dtype(self.data[self.selected_col])
    self.classifier = classify_type(self.dtype)

    import_lines = (
        "import numpy as np",
        "import pandas as pd",
        "import plotly.graph_objs as go",
    )
    self.code = build_code_export(
        data_id, imports="\n".join(import_lines) + "\n"
    )

    # No explicit type requested: fall back based on the column's classifier.
    if self.analysis_type is None:
        if self.classifier in ("F", "I", "D"):
            self.analysis_type = "histogram"
        else:
            self.analysis_type = "value_counts"

    # Lazy factories so only the chosen analysis class is looked up and built.
    factories = {
        "geolocation": lambda: GeolocationAnalysis(req),
        "histogram": lambda: HistogramAnalysis(req),
        "categories": lambda: CategoryAnalysis(req),
        "value_counts": lambda: ValueCountAnalysis(req),
        "word_value_counts": lambda: WordValueCountAnalysis(req),
        "qq": lambda: QQAnalysis(),
    }
    make_analysis = factories.get(self.analysis_type)
    # An unrecognised type leaves ``self.analysis`` unset.
    if make_analysis is not None:
        self.analysis = make_analysis()