def test_load_treated(self):
    """Check that the treated TCU file loads with the expected size.

    Loads TREATED_TCU_FILE through tcu_io.load_treated_csv_to_numpy and
    asserts that the result has 29202 entries overall and 29202 entries in
    each of the named columns.
    """
    expected_rows = 29202
    data = tcu_io.load_treated_csv_to_numpy(TREATED_TCU_FILE)

    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(expected_rows, len(data))

    names = ('Cluster', 'ChavePregao', 'UASG', 'PregoeiroOficial',
             'AceitoPara_CNPJ', 'PeloMenorLance', 'ValorReferencia',
             'GanhoPregao', 'SuperFaturamento')
    for column_name in names:
        self.assertEqual(len(data[column_name]), expected_rows)
def _build_contingency_table(row_data, col_data):
    """Count co-occurrences of each (row value, column value) pair.

    Args:
        row_data: values that select the table row.
        col_data: values that select the table column; expected to have the
            same length as row_data.

    Returns:
        An integer array of shape (n distinct row values, n distinct column
        values) where entry [i, j] is the number of positions holding the
        i-th distinct row value together with the j-th distinct column value.
        Distinct values are ordered as returned by np.unique.
    """
    # NOTE(review): assumes both inputs are 1-D columns of sortable values;
    # confirm against what tcu_io.load_treated_csv_to_numpy returns.
    row_values, row_idx = np.unique(np.ravel(row_data), return_inverse=True)
    col_values, col_idx = np.unique(np.ravel(col_data), return_inverse=True)

    # Start from zeros (np.ndarray alone leaves memory uninitialised) and
    # accumulate one count per observation; np.add.at handles repeated
    # (row, col) index pairs correctly, unlike plain fancy-index "+=".
    table = np.zeros((len(row_values), len(col_values)), dtype='i')
    np.add.at(table, (row_idx, col_idx), 1)
    return table


def main(tcu_fpath):
    """Print correlation statistics for the treated TCU data set.

    Loads the file at tcu_fpath with tcu_io.load_treated_csv_to_numpy, prints
    the result of stats.pearsonr for three pairs of numeric columns, and then,
    for each categorical pair, builds a contingency table and prints the
    chi-square statistic, p-value and degrees of freedom returned by
    contingency.chi2_contingency.

    Args:
        tcu_fpath: path of the treated TCU file to analyse.
    """
    data = tcu_io.load_treated_csv_to_numpy(tcu_fpath)

    print(stats.pearsonr(data['ValorReferencia'], data['PeloMenorLance']))
    print(stats.pearsonr(data['ValorReferencia'], data['GanhoPregao']))
    print(stats.pearsonr(data['GanhoPregao'], data['PeloMenorLance']))

    to_corr_cat = [('SuperFaturamento', 'PregoeiroOficial'),
                   ('SuperFaturamento', 'AceitoPara_CNPJ'),
                   ('SuperFaturamento', 'Cluster')]

    for pair in to_corr_cat:
        row, col = pair
        contingency_table = _build_contingency_table(data[row], data[col])

        # Wrap the pair in a 1-tuple so "%" formats it as a single value;
        # the original passed it as a second print argument and emitted a
        # literal "%s".
        print('Correlating %s' % (pair,))
        chi2, p, dof, _expected = \
            contingency.chi2_contingency(contingency_table)
        print('Correlation', pair, ': chi2 = %f; p = %f; df = %f;'
              % (chi2, p, dof))