Пример #1
0
    def test_float_integrity(self):
        """Check that cleaning with an empty rule list leaves the CSV as is.

        The file at ``self.input_path`` is run through
        ``DataCleaner.clean_file`` with no rules, and whatever ``raw_csv``
        returns for the input must equal what it returns for the output.
        """
        cleaned_path = BASE_DIR + '/output/clean_coordinates.csv'

        cleaner = DataCleaner(self.input_path)
        cleaner.clean_file([], cleaned_path)

        # Input is read first, then the freshly written output.
        self.assertEqual(raw_csv(self.input_path), raw_csv(cleaned_path))
    def test_integration_case_1(self):
        """Apply ``rules`` to the integration input and compare the result.

        The cleaned file is written to the "temp_integration" output, read
        back with pandas, and checked column by column against the
        "integration" output file.
        """
        cleaner = DataCleaner(get_input("integration"))
        cleaner.clean_file(rules, get_output("temp_integration"))

        actual = pd.read_csv(get_output("temp_integration"))
        expected = pd.read_csv(get_output("integration"))

        # Both files must expose the same column names, in any order.
        self.assertEqual(set(actual.columns), set(expected.columns))
        for column in actual.columns:
            actual_values = nan_safe_list(actual[column])
            expected_values = nan_safe_list(expected[column])
            self.assertEqual(actual_values, expected_values)
Пример #3
0
    def test_integration_case_1(self):
        """Clean the integration input with ``rules`` and verify the output.

        The result written to the "temp_integration" output is loaded with
        pandas and must match the "integration" output file: same set of
        column names, and equal values in every column.
        """
        data_cleaner = DataCleaner(get_input("integration"))
        data_cleaner.clean_file(rules, get_output("temp_integration"))

        produced = pd.read_csv(get_output("temp_integration"))
        reference = pd.read_csv(get_output("integration"))

        self.assertEqual(set(produced.columns), set(reference.columns))
        for name in produced.columns:
            # Columns are passed through nan_safe_list before being compared.
            produced_column = nan_safe_list(produced[name])
            reference_column = nan_safe_list(reference[name])
            self.assertEqual(produced_column, reference_column)
Пример #4
0
    #     {
    #      "field": "horario_de_atencion",
    #      "replacements": {"LUN": ["lunes", "lun"],
    #                       "MAR": ["martes", "mar"],
    #                       "MIE": ["miercoles", "mie", u"miércoles"],
    #                       "JUE": ["jueves", "jue"],
    #                       "VIE": ["viernes", "vie"],
    #                       "SAB": ["sabado", "sab", "sábado","sáb"],
    #                       "DOM": ["domingo", "dom"],
    #                       "-": [" a "],
    #                       "_": [" y ", ","],
    #                       "": ["hs", "hs."],
    #                       "00:00-23:59": ["24"]
    #                      },
    #      "keep_original": True
    #     }
    #    ]}
]

dc = DataCleaner(input_path)
# Steps that have no cleaning rule implemented yet are done directly with
# pandas on the underlying DataFrame.

# Split "recurso" once on the first run of whitespace: the first piece goes to
# the latitude column and the remainder to the longitude column. `n` is passed
# by keyword because newer pandas releases reject it positionally, and the
# patterns are raw strings so that "\s" is not an invalid string escape.
coordinates = dc.df.recurso.str.split(r"\s+", n=1, expand=True)
dc.df['coordenadas_latitud'] = coordinates[0]
dc.df['coordenadas_longitud'] = coordinates[1]

dc.df['mail'] = dc.df['mail'].str.lower()
# Pull the "www..." matches out of the lower-cased "mail" text BEFORE that
# column is overwritten with only the e-mail addresses found in it.
dc.df['sitio_web'] = dc.df.mail.str.findall(r'www[^ \s]+').str.join(",")
dc.df['mail'] = dc.df.mail.str.findall(
    r'[a-z_0-9\.]+@[a-z_0-9\.]+').str.join(",")
dc.clean_file(rules, output_path)
def clean_file(input_path, output_path):
    """Clean the input data, writing the result to a new, clean file.

    Args:
        input_path: Passed to ``DataCleaner`` to load the data.
        output_path: Passed to ``DataCleaner.clean_file`` together with the
            module-level ``RULES``.
    """
    print("Comenzando limpieza...")
    DataCleaner(input_path).clean_file(RULES, output_path)
    print("Limpieza finalizada exitosamente!")
    # DOES NOT WORK WELL, SO THE FIELD IS LEFT AS IS
#     {"reemplazar": [
#     {
#      "field": "horario_de_atencion",
#      "replacements": {"LUN": ["lunes", "lun"], 
#                       "MAR": ["martes", "mar"],
#                       "MIE": ["miercoles", "mie", u"miércoles"],
#                       "JUE": ["jueves", "jue"],
#                       "VIE": ["viernes", "vie"],
#                       "SAB": ["sabado", "sab", "sábado","sáb"],
#                       "DOM": ["domingo", "dom"],
#                       "-": [" a "],
#                       "_": [" y ", ","],
#                       "": ["hs", "hs."],
#                       "00:00-23:59": ["24"]
#                      },
#      "keep_original": True
#     }
#    ]}

]

dc = DataCleaner(input_path)
# Steps that have no cleaning rule implemented yet are done directly with
# pandas on the underlying DataFrame.

# "recurso" is split a single time on its first run of whitespace; piece 0
# becomes the latitude column and piece 1 the longitude column. `n` must be a
# keyword argument on newer pandas releases, and raw strings keep "\s" from
# being treated as an invalid string escape.
recurso_parts = dc.df.recurso.str.split(r"\s+", n=1, expand=True)
dc.df['coordenadas_latitud'] = recurso_parts[0]
dc.df['coordenadas_longitud'] = recurso_parts[1]

dc.df['mail'] = dc.df['mail'].str.lower()
# Order matters: "sitio_web" is extracted from the lower-cased "mail" text
# before "mail" is replaced by only the e-mail addresses it contains.
dc.df['sitio_web'] = dc.df.mail.str.findall(r'www[^ \s]+').str.join(",")
dc.df['mail'] = dc.df.mail.str.findall(
    r'[a-z_0-9\.]+@[a-z_0-9\.]+').str.join(",")
dc.clean_file(rules, output_path)