Example #1
def create_reduced_tables(tables, selected_attributes):
    row_att_list = 0
    att_names = []
    columns_to_use = []
    reduced_files = []
    for table in tables:
        logger.info("Saving reduced table of: " + str(table[-15:-4]))
        df = pd.read_csv(table, encoding='latin1')
        row_headers = 0
        row_att = 0
        # Column positions (in ascending order) selected for this table
        attributes = selected_attributes[row_att_list]
        headers = df.columns.values.tolist()
        for field in headers:
            # Keep the column when its position matches the next selected index
            if row_att < len(attributes) and row_headers == attributes[row_att]:
                row_att = row_att + 1
                columns_to_use.append(field)
            row_headers = row_headers + 1
        reduced_files.append(table[0:-4] + "_reduced.csv")
        df.to_csv(reduced_files[-1],
                  index=False,
                  columns=columns_to_use,
                  encoding="latin1")
        row_att_list = row_att_list + 1
        att_names.append(columns_to_use)
        columns_to_use = []

    return reduced_files, att_names
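A minimal usage sketch for create_reduced_tables, assuming pandas and the module-level logger are already imported; the file paths and attribute indices below are purely illustrative:

# Hypothetical call: one list of column positions per CSV, as returned by cfs()
tables = ["./data/schema_area1.csv", "./data/schema_area2.csv"]
selected_attributes = [[0, 3, 7], [1, 2, 7]]
reduced_files, att_names = create_reduced_tables(tables, selected_attributes)
# reduced_files -> ["./data/schema_area1_reduced.csv", "./data/schema_area2_reduced.csv"]
# att_names     -> the column names actually kept for each table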
Example #2
def join_tables(tables, relations, directory):
    repited_columns = []
    logger.info("Reading original tables... ")
    df1 = pd.read_csv(directory + tables[0] + '.csv', encoding='latin1')
    del tables[0]
    headers_df1 = df1.columns.values.tolist()

    logger.info("Joining original tables... ")
    for key_table, value_table in tables.items():
        df2 = pd.read_csv(directory + value_table + '.csv', encoding='latin1')
        headers_df2 = df2.columns.values.tolist()
        # Check for columns that share a name in both dataframes (other than the join keys) so they can be dropped
        for i in headers_df1:
            for j in headers_df2:
                if i == j and i not in relations.values():
                    repited_columns.append(i)

        df2 = df2.drop(
            repited_columns, axis=1
        )  # Drop the columns listed in repited_columns from df2 before merging
        repited_columns.clear()

        for key_rel, value_rel in relations.items():
            if key_rel == key_table:
                df1 = pd.merge(
                    df1, df2, on=value_rel,
                    how='inner')  # The fields used for the join are CASE SENSITIVE

    df1.to_csv(directory + "mergedTables.csv", index=False, encoding='latin1')
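The expected shapes of tables and relations follow from the code above: tables is a dict whose entry 0 names the base table and whose remaining entries are joined in, while relations maps each of those keys to its join column. A hedged sketch with made-up table and column names:

# Hypothetical schema: a base table joined to two others on shared key columns
tables = {0: "alumnos", 1: "calificaciones", 2: "materias"}
relations = {1: "ID_ALUMNO", 2: "ID_MATERIA"}
join_tables(tables, relations, os.getcwd() + "/data/")
# Writes <directory>/mergedTables.csv with the inner joins applied in turn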
Example #3
def create_results_page(schema):
    logger.info("Creating Webpage with the general results.")
    new_page_name = "./results/" + schema + ".html"
    with open("./results/base.html", "r") as base_page:
        with open(new_page_name, "w") as new_page:
            with open("./results/results.json") as result_file:
                data = json.load(result_file)
                for line in base_page.readlines():
                    if line != "<!--FLAG-->\n":
                        new_page.write(line)
                    else:
                        for i in range(len(data["tables"])):
                            new_page.write('\t\t\t\t\t\t<div class="row">\n')

                            new_page.write(
                                '\t\t\t\t\t\t\t<div class="cell" data-title="Tabla">\n'
                            )
                            new_page.write('\t\t\t\t\t\t\t\t' +
                                           data["fulltables"][i][0:-4] + '\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')

                            new_page.write(
                                '\t\t\t\t\t\t\t<div class="cell" data-title="N">\n'
                            )
                            new_page.write('\t\t\t\t\t\t\t\t' +
                                           str(data["Cardinality"][i]) + '\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')

                            new_page.write(
                                '\t\t\t\t\t\t\t<div class="cell" data-title="Attributos Seleccionados">\n'
                            )
                            new_page.write('\t\t\t\t\t\t\t\t<ul>\n')
                            for att in data["SelectedAttributes"][i]:
                                new_page.write('\t\t\t\t\t\t\t\t\t<li>' + att +
                                               "</li>\n")
                            new_page.write('\t\t\t\t\t\t\t\t</ul>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')

                            new_page.write(
                                '\t\t\t\t\t\t\t<div class="cell" data-title="Descargar">\n'
                            )
                            new_page.write(
                                '\t\t\t\t\t\t\t\t<a href="http://148.204.66.69/aluse/analize/files/data/'
                                + data["tables"][i] + '" download>\n')
                            new_page.write('\t\t\t\t\t\t\t\t\tDescargar\n')
                            new_page.write('\t\t\t\t\t\t\t\t</a>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')

                            new_page.write(
                                '\t\t\t\t\t\t\t<div class="cell" data-title="Descargar">\n'
                            )
                            new_page.write(
                                '\t\t\t\t\t\t\t\t<a href="http://148.204.66.69/aluse/analize/files/data/'
                                + data["fulltables"][i] + '" download>\n')
                            new_page.write('\t\t\t\t\t\t\t\t\tDescargar\n')
                            new_page.write('\t\t\t\t\t\t\t\t</a>\n')
                            new_page.write('\t\t\t\t\t\t\t</div>\n')

                            new_page.write('\t\t\t\t\t\t</div>\n\n')
Example #4
def delete_originals(tables, data_route):
    tables.append("catalogo")
    logger.info("Deleting original Files: " + str(tables))
    for table in tables:
        file  = "." + data_route + table + ".csv"
        if os.path.exists(file):
            os.remove(file)
        else:
            logger.error("The original files do not exist.")
Example #5
def cfs(table, cores):
    loader = Loader("weka.core.converters.CSVLoader")
    anneal_data = loader.load_file(table)
    anneal_data.class_is_last()
    logger.info("Running attribute selection for: " + str(table.split("/")[-1]) + ". Please, wait a moment.")
    search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "0", "-N", "5"])
    evaluation = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-Z", "-P", cores, "-E", cores])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    logger.info("Selected attributes: " + str(attsel.selected_attributes))
    anneal_data.delete(index=None) # TO-DO: deleting instances does not work yet

    return list(attsel.selected_attributes)
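cfs builds on python-weka-wrapper, so the JVM has to be running before it is called; a rough usage sketch, assuming the import path of python-weka-wrapper3 (treat the exact module names as an assumption):

import weka.core.jvm as jvm

jvm.start(packages=True)
try:
    # cores is forwarded as a string to the -P/-E options of CfsSubsetEval
    selected = cfs("./data/schema_area1.csv", "4")
    # selected holds 0-based attribute indices; the last entry is the class index
finally:
    jvm.stop()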
Example #6
def create_result_files(att_names, reduced_files, full_files):
    logger.info("Creating the file which includes all results from the precesses.")
    data = {}
    data["tables"] = []
    data["fulltables"] = []
    data["SelectedAttributes"] = []
    data["Cardinality"] = []
    row = 0
    for att in att_names:
        file = reduced_files[row].split("/")
        data["tables"].append(file[-1])
        file = full_files[row].split("/")
        data["fulltables"].append(file[-1])
        data["SelectedAttributes"].append(att[0:-1])
        data["Cardinality"].append(len(att)-1)
        row = row + 1

    with open('./results/results.json', 'w') as outfile:
        json.dump(data, outfile, indent=4)
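Based on this function, the ./results/results.json consumed by create_results_page would look roughly as follows (the file and attribute names are illustrative):

{
    "tables": ["schema_AREA1_reduced.csv"],
    "fulltables": ["schema_AREA1.csv"],
    "SelectedAttributes": [["VAR_A", "VAR_B"]],
    "Cardinality": [2]
}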
Example #7
def substitute_tables(schema, directory, catalogo):
    fields = []
    fields_changed = []
    logger.info("Reading joined tables... ")
    df_cat = pd.read_csv(directory + catalogo + '.csv', encoding='latin1')
    for i in range(len(df_cat.index)):
        if df_cat.iloc[i][0] not in fields:
            fields.append(df_cat.iloc[i][0])

    df_data = pd.read_csv(directory + 'mergedTables.csv',
                          encoding='latin1',
                          usecols=fields)
    logger.info("Merging data meanings...")
    row = 0
    for field in fields:
        # Skip variables that are already substituted (catalog rows whose VAL is "ALL")
        if df_cat.iloc[row]["VAL"] == "ALL":
            fields_changed.append(field)
            row = row + 1
        else:
            fields_changed.append(field + "_")
            df_aux = pd.DataFrame(columns=[field, fields_changed[-1]])
            while df_cat.iloc[row]["CAMPO"] == field:
                df_aux.loc[row] = [
                    df_cat.iloc[row]["VAL"], df_cat.iloc[row]["CONTENIDO"]
                ]
                row = row + 1
                if row == df_cat.shape[0]:
                    break
            df_aux[field] = df_aux[field].astype(numpy.int64)
            # At this point df_aux holds the small lookup table for this field
            df_data = pd.merge(df_data, df_aux, on=field, how='inner')

    df_data = df_data.loc[:, fields_changed]

    return df_data
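substitute_tables assumes a catalog CSV whose first column (CAMPO) names a field of mergedTables.csv, whose VAL column holds the numeric code (or "ALL" when the field needs no substitution), and whose CONTENIDO column holds the meaning of that code. A tiny illustrative catalog (field names are hypothetical):

CAMPO,VAL,CONTENIDO
SEXO,1,Masculino
SEXO,2,Femenino
EDAD,ALL,ALL

With rows like these, the coded SEXO values are merged against the lookup and come back as a SEXO_ column holding their textual meaning, while EDAD passes through unchanged.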
Example #8
def clean_inconsistencies(bigFile):
    lista = []
    aux = []

    flag = False
    with open(bigFile) as csvfile:
        logger.info("Reading ALUSE file")
        reader = csv.reader(csvfile)
        for row in reader:
            lista.append(row)
        indexes = list(range(1, len(lista)))
        count = 0
        initialSize = len(lista) - 1
        logger.info("Dataset initial Size: " + str(initialSize) + "\nStarting Analysis")
        inc = {}
        toDelete = []
        for i in indexes:
            for j in indexes[i:]:
                if i != j:
                    if lista[i][:-1] == lista[j][:-1] and lista[i][-1] != lista[j][-1]:
                        aux.append(j)
                        if i not in inc.keys():
                            inc.update({i : [j]})
                        else:
                            inc[i].append(j)
                        count += 1
            labels = [lista[i][-1]]
            for k in aux:
                labels.append(lista[k][-1])
            m = take_mode(labels)
            # Replace the label of row i with the mode of the conflicting labels
            lista[i][-1] = m

            for k in aux:
                if k not in toDelete:
                    toDelete.append(k)
            aux = []
        toDelete.sort()
        print(toDelete)
        deleted = []
        offset = 0
        for i in toDelete:
            if i not in deleted:
                del lista[i-offset]
            deleted.append(i)
            offset += 1
        finalSize = len(lista) - 1
        logger.info("Final size of Dataset: " + str(finalSize))

    with open("new"+bigFile, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(lista)
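take_mode is defined elsewhere in the project; from the way it is used above it only needs to return the most frequent label among the conflicting rows. A minimal sketch (tie-breaking behavior is an assumption):

from collections import Counter

def take_mode(labels):
    # Most common label; on a tie, Counter keeps the first-seen label
    return Counter(labels).most_common(1)[0][0]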
Example #9
def initialize_data(schema, data_directory, tables, relations, n_pv, n_areas):
    logger.info("Initializing all tables.")
    join_tables(tables, relations, os.getcwd() + data_directory)
    files = obtain_all_tables(schema, os.getcwd() + data_directory, "catalogo", n_pv, n_areas)

    return files
Example #10
def obtain_all_tables(schema, directory, catalogo, n_pv, n_areas):
    files_routes = []
    all_files = []
    t1 = substitute_tables(schema, directory, catalogo)
    headers = t1.columns.values.tolist()
    logger.info("Creating tables for each Area using the Mode and the others approaches")
    rIndex = 0
    for i in range(n_areas, 0, -1):
        columns_to_use = headers[ : len(headers)-(n_areas*n_pv)]
        logger.info("Total number of variables: " + str(len(columns_to_use)))
        index = n_pv*i
        for j in range(n_pv, 0, -1):
            columns_to_use.append(headers[-index + j - 1])
        files_routes.append(directory + schema + "_all.csv")
        logger.info("Saving table with all PVs in the area of " + str(columns_to_use[-1][0:-2]))
        t1.to_csv(files_routes[rIndex], index=False, columns=columns_to_use, encoding='latin1')

        columns_to_use = columns_to_use[ : len(columns_to_use) - n_pv]
        for j in range(n_pv, 0, -1):
            columns_to_use.append(headers[-index + j - 1])
            logger.info("Generating RandomColumnPV Table for: " + str(columns_to_use[-1][0:-1]))
            all_files.append(files_routes[rIndex][0:-8] + "_" + columns_to_use[-1][0:-1] + ".csv")
            t1.to_csv(files_routes[rIndex][0:-8] + "_" + columns_to_use[-1][0:-1] + ".csv", index=False, columns=columns_to_use, encoding='latin1')
            if j != 1:
                del columns_to_use[-1]

        logger.info("Generating Mode Table for: " + str(columns_to_use[-1][0:-2]))
        m = generate_moda_PV(files_routes[rIndex], columns_to_use[-1][0:-2], n_pv, directory, schema)
        all_files.append(m)
        logger.info("Generating RandomPV  Table for: " + str(columns_to_use[-1][0:-2]))
        r = generate_random_PV(files_routes[rIndex], columns_to_use[-1][0:-2], n_pv, directory, schema)
        all_files.append(r)
        rIndex = rIndex + 1

    return all_files
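A hedged sketch of calling obtain_all_tables directly, assuming n_pv is the number of PV columns kept per area, n_areas the number of areas at the tail of the merged header list, and generate_moda_PV / generate_random_PV are defined elsewhere in the project:

# Hypothetical run: two areas with three PV columns each, data under ./data/
files = obtain_all_tables("schema", os.getcwd() + "/data/", "catalogo", 3, 2)
for f in files:
    logger.info("Generated table: " + f)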