def create_reduced_tables(tables, selected_attributes):
    row_att_list = 0
    att_names = []
    columns_to_use = []
    reduced_files = []
    for table in tables:
        logger.info("Saving reduced table of: " + str(table[-15:-4]))
        df = pd.read_csv(table, encoding='latin1')
        row_headers = 0
        row_att = 0
        attributes = selected_attributes[row_att_list]
        headers = df.columns.values.tolist()
        # Keep only the columns whose position appears in the selected-attribute list.
        # The bounds check avoids an IndexError once every selected index has been matched.
        for field in headers:
            if row_att < len(attributes) and row_headers == attributes[row_att]:
                row_att = row_att + 1
                columns_to_use.append(field)
            row_headers = row_headers + 1
        reduced_files.append(table[0:-4] + "_reduced.csv")
        df.to_csv(reduced_files[-1], index=False, columns=columns_to_use, encoding="latin1")
        row_att_list = row_att_list + 1
        att_names.append(columns_to_use)
        columns_to_use = []
    return reduced_files, att_names
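# Illustrative usage sketch (not part of the original module): create_reduced_tables
# expects one list of attribute indices per table, in the same order as `tables`.
# Those index lists are what cfs() further below returns: 0-based column positions
# with the class column included as the last element. File names and indices here
# are hypothetical.
def _example_create_reduced_tables():
    tables = ["./files/data/schema_Area1.csv", "./files/data/schema_Area2.csv"]  # hypothetical paths
    selected_attributes = [
        [0, 3, 7, 12],   # columns kept for the first table (12 = class column)
        [1, 2, 9, 12],   # columns kept for the second table
    ]
    reduced_files, att_names = create_reduced_tables(tables, selected_attributes)
    # reduced_files -> [".../schema_Area1_reduced.csv", ".../schema_Area2_reduced.csv"]
    # att_names     -> the column names kept for each table, class column included
    return reduced_files, att_names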
def join_tables(tables, relations, directory):
    repited_columns = []
    logger.info("Reading original tables... ")
    # 'tables' is expected to be a dict keyed by an integer index; key 0 holds the base table.
    df1 = pd.read_csv(directory + tables[0] + '.csv', encoding='latin1')
    del tables[0]
    headers_df1 = df1.columns.values.tolist()
    logger.info("Joining original tables... ")
    for key_table, value_table in tables.items():
        df2 = pd.read_csv(directory + value_table + '.csv', encoding='latin1')
        headers_df2 = df2.columns.values.tolist()
        # Check for columns that share a name across the two dataframes and drop them
        # from df2, unless they are join keys listed in 'relations'.
        for i in headers_df1:
            for j in headers_df2:
                if i == j and i not in relations.values():
                    repited_columns.append(i)
        df2 = df2.drop(repited_columns, axis=1)  # drop the duplicated (non-key) columns
        repited_columns.clear()
        for key_rel, value_rel in relations.items():
            if key_rel == key_table:
                # The join fields are CASE SENSITIVE.
                df1 = pd.merge(df1, df2, on=value_rel, how='inner')
    df1.to_csv(directory + "mergedTables.csv", index=False, encoding='latin1')
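# Illustrative usage sketch (not part of the original module): join_tables reads
# `tables` as a dict keyed by an integer index (key 0 is the base table) and
# `relations` as a dict with the same keys mapping to the join column of each table.
# Table names, column names and the path below are hypothetical.
def _example_join_tables():
    tables = {0: "alumnos", 1: "calificaciones", 2: "planteles"}   # hypothetical table names
    relations = {1: "ID_ALUMNO", 2: "ID_PLANTEL"}                  # hypothetical join keys
    join_tables(tables, relations, os.getcwd() + "/files/data/")
    # Writes <directory>/mergedTables.csv with the three tables inner-joined.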
def create_results_page(schema):
    logger.info("Creating Webpage with the general results.")
    new_page_name = "./results/" + schema + ".html"
    with open("./results/base.html", "r") as base_page:
        with open(new_page_name, "w") as new_page:
            with open("./results/results.json") as result_file:
                data = json.load(result_file)
                for line in base_page.readlines():
                    if line != "<!--FLAG-->\n":
                        new_page.write(line)
                    else:
                        # Replace the FLAG placeholder with one table row per result.
                        for i in range(len(data["tables"])):
                            new_page.write('\t\t\t\t\t\t<div class="row">\n')
                            new_page.write('\t\t\t\t\t\t\t<div class="cell" data-title="Tabla">\n')
                            new_page.write('\t\t\t\t\t\t\t\t' + data["fulltables"][i][0:-4] + '\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')
                            new_page.write('\t\t\t\t\t\t\t<div class="cell" data-title="N">\n')
                            new_page.write('\t\t\t\t\t\t\t\t' + str(data["Cardinality"][i]) + '\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')
                            new_page.write('\t\t\t\t\t\t\t<div class="cell" data-title="Attributos Seleccionados">\n')
                            new_page.write('\t\t\t\t\t\t\t\t<ul>\n')
                            for att in data["SelectedAttributes"][i]:
                                new_page.write('\t\t\t\t\t\t\t\t\t<li>' + att + "</li>\n")
                            new_page.write('\t\t\t\t\t\t\t\t</ul>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')
                            new_page.write('\t\t\t\t\t\t\t<div class="cell" data-title="Descargar">\n')
                            new_page.write('\t\t\t\t\t\t\t\t<a href="http://148.204.66.69/aluse/analize/files/data/'
                                           + data["tables"][i] + '" download>\n')
                            new_page.write('\t\t\t\t\t\t\t\t\tDescargar\n')
                            new_page.write('\t\t\t\t\t\t\t\t</a>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')
                            new_page.write('\t\t\t\t\t\t\t<div class="cell" data-title="Descargar">\n')
                            new_page.write('\t\t\t\t\t\t\t\t<a href="http://148.204.66.69/aluse/analize/files/data/'
                                           + data["fulltables"][i] + '" download>\n')
                            new_page.write('\t\t\t\t\t\t\t\t\tDescargar\n')
                            new_page.write('\t\t\t\t\t\t\t\t</a>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')
                            new_page.write('\t\t\t\t\t\t</div>\n\n')
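# Illustrative sketch (not part of the original module): create_results_page only
# injects rows where base.html contains a line that is exactly "<!--FLAG-->".
# A small guard like this can fail early if the template lacks the placeholder;
# the function name and default path are hypothetical.
def _check_results_template(template_path="./results/base.html"):
    with open(template_path, "r") as template:
        if "<!--FLAG-->\n" not in template.readlines():
            raise ValueError("Template is missing the <!--FLAG--> placeholder line: " + template_path)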
def delete_originals(tables, data_route):
    tables.append("catalogo")
    logger.info("Deleting original Files: " + str(tables))
    for table in tables:
        file = "." + data_route + table + ".csv"
        if os.path.exists(file):
            os.remove(file)
        else:
            logger.error("The original file does not exist: " + file)
def cfs(table, cores):
    loader = Loader("weka.core.converters.CSVLoader")
    anneal_data = loader.load_file(table)
    anneal_data.class_is_last()
    logger.info("Running attribute selection for: " + str(table.split("/")[-1])
                + ". Please, wait a moment.")
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "0", "-N", "5"])
    evaluation = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                              options=["-Z", "-P", cores, "-E", cores])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    logger.info("Selected attributes: " + str(attsel.selected_attributes))
    anneal_data.delete(index=None)  # TODO: deleting the instances does not work yet
    return list(attsel.selected_attributes)
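# Illustrative usage sketch (not part of the original module): cfs() relies on
# python-weka-wrapper (Loader, ASSearch, ASEvaluation, AttributeSelection), which
# requires the JVM to be started before any Weka class is used and stopped when the
# run finishes. The file path and core count below are hypothetical; Weka options
# are passed as strings.
def _example_cfs():
    import weka.core.jvm as jvm
    jvm.start()                                                  # must run before any Weka call
    try:
        selected = cfs("./files/data/schema_Area1.csv", "4")     # hypothetical CSV and thread count
        # `selected` is a list of 0-based attribute indices, with the class index last.
        return selected
    finally:
        jvm.stop()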
def create_result_files(att_names, reduced_files, full_files):
    logger.info("Creating the file which includes all results from the processes.")
    data = {}
    data["tables"] = []
    data["fulltables"] = []
    data["SelectedAttributes"] = []
    data["Cardinality"] = []
    row = 0
    for att in att_names:
        file = reduced_files[row].split("/")
        data["tables"].append(file[-1])
        file = full_files[row].split("/")
        data["fulltables"].append(file[-1])
        data["SelectedAttributes"].append(att[0:-1])   # drop the class attribute
        data["Cardinality"].append(len(att) - 1)
        row = row + 1
    with open('./results/results.json', 'w') as outfile:
        json.dump(data, outfile, indent=4)
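# Illustrative sketch (not part of the original module) of the results.json layout
# produced above; the values are hypothetical. create_results_page() expects exactly
# these four keys, with one entry per analysed table.
_EXAMPLE_RESULTS_JSON = {
    "tables": ["schema_Area1_reduced.csv"],
    "fulltables": ["schema_Area1.csv"],
    "SelectedAttributes": [["SEXO_", "EDAD_", "TURNO_"]],   # class column already dropped
    "Cardinality": [3],
}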
def substitute_tables(schema, directory, catalogo):
    fields = []
    fields_changed = []
    logger.info("Reading joined tables... ")
    df_cat = pd.read_csv(directory + catalogo + '.csv', encoding='latin1')
    for i in range(len(df_cat.index)):
        if df_cat.iloc[i][0] not in fields:
            fields.append(df_cat.iloc[i][0])
    df_data = pd.read_csv(directory + 'mergedTables.csv', encoding='latin1', usecols=fields)
    logger.info("Merging data meanings...")
    row = 0
    for field in fields:
        # Fields marked "ALL" in the catalog already hold meaningful values and are kept as-is.
        if df_cat.iloc[row]["VAL"] == "ALL":
            fields_changed.append(field)
            row = row + 1
        else:
            fields_changed.append(field + "_")
            df_aux = pd.DataFrame(columns=[field, fields_changed[-1]])
            while df_cat.iloc[row]["CAMPO"] == field:
                df_aux.loc[row] = [df_cat.iloc[row]["VAL"], df_cat.iloc[row]["CONTENIDO"]]
                row = row + 1
                if row == df_cat.shape[0]:
                    break
            df_aux[field] = df_aux[field].astype(numpy.int64)
            # At this point the small lookup table for this field is ready in df_aux.
            df_data = pd.merge(df_data, df_aux, on=field, how='inner')
    df_data = df_data.loc[:, fields_changed]
    return df_data
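# Illustrative sketch (not part of the original module): the catalog read above is
# expected to provide the columns CAMPO (field name), VAL (coded value) and CONTENIDO
# (human-readable meaning), one row per code, with "ALL" in VAL for fields that need
# no substitution. The rows below are hypothetical.
def _example_catalogo():
    return pd.DataFrame(
        {
            "CAMPO": ["SEXO", "SEXO", "EDAD"],
            "VAL": ["1", "2", "ALL"],
            "CONTENIDO": ["Hombre", "Mujer", "EDAD"],
        }
    )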
def clean_inconsistencies(bigFile):
    lista = []
    aux = []
    with open(bigFile) as csvfile:
        logger.info("Reading ALUSE file")
        reader = csv.reader(csvfile)
        for row in reader:
            lista.append(row)
    indexes = list(range(1, len(lista)))
    count = 0
    initialSize = len(lista) - 1
    logger.info("Dataset initial Size: " + str(initialSize) + "\nStarting Analysis")
    inc = {}
    toDelete = []
    for i in indexes:
        # Collect every later row that has the same features as row i but a different label.
        for j in indexes[i:]:
            if i != j:
                if lista[i][:-1] == lista[j][:-1] and lista[i][-1] != lista[j][-1]:
                    aux.append(j)
                    if i not in inc.keys():
                        inc.update({i: [j]})
                    else:
                        inc[i].append(j)
                    count += 1
        # Keep row i with the most frequent label and mark its inconsistent copies for deletion.
        labels = [lista[i][-1]]
        for k in aux:
            labels.append(lista[k][-1])
        m = take_mode(labels)
        lista[i][-1] = m
        for k in aux:
            if k not in toDelete:
                toDelete.append(k)
        aux = []
    toDelete.sort()
    print(toDelete)
    deleted = []
    offset = 0
    for i in toDelete:
        if i not in deleted:
            del lista[i - offset]
            deleted.append(i)
            offset += 1
    finalSize = len(lista) - 1
    logger.info("Final size of Dataset: " + str(finalSize))
    with open("new" + bigFile, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(lista)
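# take_mode() is used above but not defined in this section. A minimal sketch of
# what such a helper might look like, assuming the most frequent label should win
# and ties fall back to the first value seen; the name _example_take_mode is used
# here to avoid shadowing the real helper defined elsewhere.
def _example_take_mode(labels):
    from collections import Counter
    # most_common(1) returns [(value, count)] for the most frequent value.
    return Counter(labels).most_common(1)[0][0]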
def initialize_data(schema, data_directory, tables, relations, n_pv, n_areas):
    logger.info("Initializing all tables.")
    join_tables(tables, relations, os.getcwd() + data_directory)
    files = obtain_all_tables(schema, os.getcwd() + data_directory, "catalogo", n_pv, n_areas)
    return files
def obtain_all_tables(schema, directory, catalogo, n_pv, n_areas):
    files_routes = []
    all_files = []
    t1 = substitute_tables(schema, directory, catalogo)
    headers = t1.columns.values.tolist()
    logger.info("Creating tables for each Area using the Mode and the others approaches")
    rIndex = 0
    for i in range(n_areas, 0, -1):
        # Start from the shared variables and append the PV columns of the current area.
        columns_to_use = headers[: len(headers) - (n_areas * n_pv)]
        logger.info("Total number of variables: " + str(len(columns_to_use)))
        index = n_pv * i
        for j in range(n_pv, 0, -1):
            columns_to_use.append(headers[-index + j - 1])
        files_routes.append(directory + schema + "_all.csv")
        logger.info("Saving table with all PVs in the area of " + str(columns_to_use[-1][0:-2]))
        t1.to_csv(files_routes[rIndex], index=False, columns=columns_to_use, encoding='latin1')
        columns_to_use = columns_to_use[: len(columns_to_use) - n_pv]
        for j in range(n_pv, 0, -1):
            columns_to_use.append(headers[-index + j - 1])
            logger.info("Generating RandomColumnPV Table for: " + str(columns_to_use[-1][0:-1]))
            all_files.append(files_routes[rIndex][0:-8] + "_" + columns_to_use[-1][0:-1] + ".csv")
            t1.to_csv(all_files[-1], index=False, columns=columns_to_use, encoding='latin1')
            if j != 1:
                del columns_to_use[-1]
        logger.info("Generating Mode Table for: " + str(columns_to_use[-1][0:-2]))
        m = generate_moda_PV(files_routes[rIndex], columns_to_use[-1][0:-2], n_pv, directory, schema)
        all_files.append(m)
        logger.info("Generating RandomPV Table for: " + str(columns_to_use[-1][0:-2]))
        r = generate_random_PV(files_routes[rIndex], columns_to_use[-1][0:-2], n_pv, directory, schema)
        all_files.append(r)
        rIndex = rIndex + 1
    return all_files
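# Illustrative end-to-end sketch (not part of the original module) of how the
# functions above fit together, based only on their signatures: join and substitute
# the raw tables, run CFS attribute selection on each generated table, save the
# reduced tables, write results.json and the results page, and finally delete the
# originals. The schema name, paths, table names and counts are hypothetical, and
# the Weka JVM must already be running for cfs() (see _example_cfs above).
def _example_pipeline():
    schema = "schema"
    data_route = "/files/data/"
    tables = {0: "alumnos", 1: "calificaciones"}       # hypothetical table names
    relations = {1: "ID_ALUMNO"}                       # hypothetical join keys
    full_files = initialize_data(schema, data_route, tables, relations, n_pv=3, n_areas=2)
    selected = [cfs(f, "4") for f in full_files]
    reduced_files, att_names = create_reduced_tables(full_files, selected)
    create_result_files(att_names, reduced_files, full_files)
    create_results_page(schema)
    delete_originals(["alumnos", "calificaciones"], data_route)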