def main(tcu_fpath, col_name):
    """Plot the distribution of one column of the untreated TCU csv.

    Numeric columns are plotted as an empirical CDF scatter plot;
    string columns as a bar chart of category frequencies with the
    'N/A' entries dropped.

    Parameters
    ----------
    tcu_fpath : str
        Path to the untreated TCU csv file.
    col_name : str
        Name of the column to plot.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    column = data[col_name]

    # BUG FIX: `dtype == 'i'` / `dtype == 'f'` only match the exact
    # int32/float32 dtypes, so int64/float64 columns fell into the string
    # branch. dtype.kind is 'i' for any integer width and 'f' for any float.
    if column.dtype.kind in ('i', 'f'):
        masked = ma.masked_invalid(column)
        x, y = ecdf(masked)
        plt.plot(x, y, 'bo')
        plt.show()
    else:
        # Simple hack for the string case.
        # Creates a copy with masked ('N/A') values deleted.
        masked = column[column != 'N/A']
        cat, y = categorical_hist(masked)
        x = range(1, len(cat) + 1)
        plt.bar(x, y, width=0.5)
        plt.xticks(x, cat)
        plt.show()
def test_load_untreated(self):
    '''Checks if all lines are read, if headers are good and if N/A
    were replaced for nan in numeric fields.'''
    data = tcu_io.load_untreated_csv_to_numpy(ORIGINAL_TCU_FILE)
    # assertEquals is a deprecated alias; assertEqual is the canonical name.
    self.assertEqual(len(data), 37165)

    names = ('ChavePregao', 'UASG', 'PregoeiroOficial', 'Descricao',
             'DescricaoComplementar', 'Quantidade', 'UnidadeFornecimento',
             'ValordeReferencia', 'Situacao', 'AceitoPara', 'PeloMenorLance',
             'ValorNegociado', 'GanhoNegociacao', 'GanhoPregao',
             'AceitoPara_CNPJ')

    # Count raw 'N/A' tokens straight from the csv text...
    expected_num_nas = 0
    with open(ORIGINAL_TCU_FILE) as tcu_file:
        for line in tcu_file:
            for token in line.split(','):
                if token.strip() == 'N/A':
                    expected_num_nas += 1

    # ...and compare against the NaN / 'N/A' entries in the loaded array.
    actual_num_nas = 0
    for column_name in names:
        column = data[column_name]
        self.assertEqual(len(column), 37165)
        # dtype.kind covers every integer/float width, unlike the exact
        # `dtype != 'i' and dtype != 'f'` (int32/float32 only) comparison
        # the original used.
        if column.dtype.kind in ('i', 'f'):
            matching = np.isnan(column)
        else:
            matching = column == 'N/A'
        actual_num_nas += len(column[matching])

    self.assertEqual(expected_num_nas, actual_num_nas)
def main(tcu_fpath):
    """Run consistency checks over the accepted rows of the TCU csv.

    Verifies that (1) GanhoPregao matches the gain recomputed from the
    reference and negotiated values (1% relative tolerance), (2) each
    CNPJ maps to a single company name, and (3) each ChavePregao maps
    to a single CNPJ. Violations are printed to stdout.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    # Checks if gain is correct
    values_ref = ma.masked_invalid(data['ValordeReferencia'])
    values_neg = ma.masked_invalid(data['ValorNegociado'])
    values_gain = ma.masked_invalid(data['GanhoPregao'])
    computed_gain = (values_ref - values_neg) * 100 / values_ref
    i = 1
    for actual, computed in zip(values_gain, computed_gain):
        if abs(computed - actual) / actual > 0.01:
            print('Invalid line %d' % i, computed, actual)
        i += 1

    # Checks if only one CNPJ exists per person
    aceito_names = data['AceitoPara']
    aceito_cnpj = data['AceitoPara_CNPJ']
    uniq_check_cnpj = defaultdict(set)
    for cnpj, name in zip(aceito_cnpj, aceito_names):
        uniq_check_cnpj[cnpj].add(name)
    for cnpj in uniq_check_cnpj:
        if len(uniq_check_cnpj[cnpj]) > 1:
            print('Duplicado CNPJ', cnpj, '->')
            for name in uniq_check_cnpj[cnpj]:
                print('\t', name)

    # Checks if ChavePregao maps to one cnpj only.
    # BUG FIX: the original keyed this dict on `cnpj` and added `cnpj`
    # to its own set, so every set had size 1 and no duplicate could
    # ever be reported. Key on `chave`, collect the CNPJs.
    chaves = data['ChavePregao']
    uniq_check_chave = defaultdict(set)
    for chave, cnpj in zip(chaves, aceito_cnpj):
        uniq_check_chave[chave].add(cnpj)
    for chave in uniq_check_chave:
        if len(uniq_check_chave[chave]) > 1:
            print('Duplicado CHAVE', chave, '->')
            for cnpj in uniq_check_chave[chave]:
                print('\t', cnpj)
def main(tcu_fpath):
    """Cluster TCU auction descriptions and print the result as csv.

    Builds TF-IDF vectors from the description/unit/quantity columns,
    clusters them with MiniBatchKMeans (k = 7), derives an overpricing
    ("Superfaturamento") category from GanhoPregao, and prints one csv
    row per valid entry: cluster label, kept columns, category.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)

    # We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    # Get invalid lines (row indices to skip when printing)
    invalids = invalid(data)

    # Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column,
                       qtd_column):
        as_docs.append(" ".join(as_text))

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init = 'random')
    mbkm.fit(doc_sparse_matrix)

    # New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_

    # Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']

    # And a new column Superfaturamento.
    # BUG FIX: the original chain (`>= -50`, `< -50 and > -500`, `< -500`)
    # left ganho == -500 unmatched, so that slot of the *uninitialized*
    # np.ndarray buffer kept garbage bytes. The if/elif/else below is
    # exhaustive. NOTE(review): a NaN ganho now lands in the 'SuperPlus'
    # branch instead of garbage — confirm NaNs cannot reach this point.
    super_faturamento = np.empty(len(labels_column), dtype='S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50:    # 50% vezes o preco é aceito
            super_faturamento[i] = 'OK'
        elif ganho > -500:  # Mais que isso é super faturado
            super_faturamento[i] = 'Super'
        else:               # Mais que 5x o valor é foda.
            super_faturamento[i] = 'SuperPlus'

    # `range` works on both Python 2 and 3, unlike the original `xrange`
    # (which broke under the Python 3 print() calls used below).
    for i in range(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
def main(tcu_fpath):
    """Evaluate MiniBatchKMeans cluster quality for k = 2..15.

    For each k, runs the clustering `n_runs` times on TF-IDF vectors of
    the auction descriptions, recording the minimum inter-center
    distance (worst-case penalty to be maximized) and the mean
    document-to-assigned-center distance. Plots the inter/intra ratio
    with a 90% confidence interval per k on a log-scaled y axis.
    """
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column,
                       qtd_column):
        as_docs.append(" ".join(as_text))

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in range(n_runs):  # range, not Python-2-only xrange
        for k in k_vals:
            # Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))

            # Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)

            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter distance. We use min because the idea is to maximize
            # this. Min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            # Intra distance: distance of each doc to its own center.
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                intra_dists.append(dist_all_centers[doc_id, cluster])
            intra_array[i] = np.mean(intra_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' % (i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' % (cluster, population))
            print()

    # BUG FIX: `inter.keys()` is a view on Python 3, so the `[0] + x`
    # concatenation below raised TypeError (and key order was not
    # guaranteed ascending). sorted() gives a list in ascending k order.
    x = sorted(inter)
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack for the zero to appear on the axis
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()