def test_remover_filas_duplicadas_based_on_field(self):
    input_path = get_input("filas_duplicadas_con_id")
    output_path = get_output("filas_duplicadas_con_id")

    dc = DataCleaner(input_path)
    df = dc.remover_filas_duplicadas(all_fields=False, fields=["id"])

    expected_df = DataCleaner(output_path).df
    self.assertTrue(df.equals(expected_df))
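# The tests in this file rely on `get_input`/`get_output` fixture-path helpers
# that are not part of this snippet. A minimal sketch of what they presumably
# look like, assuming CSV fixtures under BASE_DIR (the directory layout and
# the file extension are assumptions):
import os

def get_input(case_name):
    """Returns the path of the dirty input fixture for a test case."""
    return os.path.join(BASE_DIR, "input", case_name + ".csv")

def get_output(case_name):
    """Returns the path of the expected clean fixture for a test case."""
    return os.path.join(BASE_DIR, "output", case_name + ".csv")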
def test_remover_filas_duplicadas(self):
    input_path = get_input("filas_duplicadas")
    output_path = get_output("filas_duplicadas")

    dc = DataCleaner(input_path)
    df = dc.remover_filas_duplicadas(all_fields=True)

    expected_df = DataCleaner(output_path).df
    self.assertTrue(df.equals(expected_df))
def test_simplify_geometry(self):
    input_path = BASE_DIR + '/input/localidades/localidades.shp'
    original = BASE_DIR + '/output/localidades-original.csv'
    simplified = BASE_DIR + '/output/localidades-simplificado.csv'

    dc = DataCleaner(input_path)
    dc.save(original)  # CSV with the original geometry.

    dc = DataCleaner(input_path)
    dc.simplificar_geometria()
    dc.save(simplified)  # CSV with the simplified geometry.

    import filecmp
    files_are_equal = filecmp.cmp(original, simplified, shallow=False)
    self.assertFalse(files_are_equal)
def word_notin_vocab():
    # Read the trained word2vec vocabulary.
    print('start getting /word')
    input_size = 16
    window_size = 2
    embedding_dim = 50
    batch_size_word2vec = 8
    file_to_save_word2vec_data = ('word2vec_ver6/ws-' + str(window_size) +
                                  '-embed-' + str(embedding_dim) +
                                  'batch_size-' + str(batch_size_word2vec) +
                                  '.pkl')
    vectors, word2int, int2word = read_trained_data(file_to_save_word2vec_data)

    # Read all sentences from the unknown-words file.
    texts = []
    from datetime import datetime
    now = datetime.now()  # assumption: current time; `now` was not defined in this snippet
    print("Current day: %d" % now.day)
    print("Current year: %d" % now.year)
    print("Current month: %d" % now.month)
    with open(unknown_file_name, encoding="utf8") as file:
        for line in file:
            temp = line.split(",", 1)
            temp[1] = temp[1].lower()
            texts.append(temp[1])  # list of training sentences

    # Collect every (word, sentence) pair whose word is missing from the vocab.
    words_notin_vocab = []
    for text in texts:
        data_cleaner = DataCleaner(text)
        all_words = data_cleaner.separate_sentence()
        for word in all_words:
            if word not in word2int:
                words_notin_vocab.append([word, text])
    return jsonify(results=words_notin_vocab)
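# `read_trained_data` is referenced above but not defined in this snippet.
# A minimal sketch, assuming the training step pickled the tuple
# (vectors, word2int, int2word) to the given path (the exact pickle layout
# is an assumption):
import pickle

def read_trained_data(path):
    """Loads previously trained word2vec data from a pickle file."""
    with open(path, "rb") as f:
        vectors, word2int, int2word = pickle.load(f)
    return vectors, word2int, int2word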
def test_get_encoding(self):
    input_path = BASE_DIR + '/input/non_unicode.csv'
    dc = DataCleaner(input_path)
    encoding = dc._get_file_encoding(input_path)
    self.assertNotEqual(encoding, 'utf-8')
def test_get_api_response(self):
    """Runs a search on a territorial entity."""
    entity = 'localidad'
    data_test = {
        'localidades': [{
            'nombre': 'laferrere',
            'aplanar': True,
            'provincia': 'buenos aires',
            'max': 1
        }]
    }
    res_test = [{
        'localidades': [{
            u'departamento_nombre': u'La Matanza',
            u'tipo': u'Entidad (E)',
            u'centroide_lon': -58.592533,
            u'municipio_nombre': u'La Matanza',
            u'provincia_id': u'06',
            u'departamento_id': u'06427',
            u'id': u'06427010004',
            u'centroide_lat': -34.746838,
            u'provincia_nombre': u'Buenos Aires',
            u'nombre': u'GREGORIO DE LAFERRERE',
            u'municipio_id': u'060427'
        }]
    }]

    input_path = get_input('normalize_unidad_territorial')
    dc = DataCleaner(input_path)
    res = dc._get_api_response(entity, data_test)
    self.assertEqual(res_test, res)
def __init__(self):
    self.stock_code = []
    self.stock_name = []
    file_name = './data/stockslist.txt'
    self.read_stock_data(file_name)

    self.help_subject = ["xem", "xem cho tôi", "cho tôi xem"]
    self.chu_ngu = [
        "tôi có nhu cầu ", "tao muốn", "", "mình cần", "tôi cần",
        "mình muốn", "đặt lênh"
    ]
    self.actions = [
        "mua", "bán", "chuyển nhượng", "sang tên", "đầu tư thêm",
        "gom", "thêm", "mua thêm"
    ]
    self.amounts = ["", "khối lượng ", "số lượng"]
    self.sub_amounts = ["", "cái", "cổ phiếu", "cổ"]
    self.words = ["tôi muốn", "bán", "mã", "khối lương", "giá"]
    self.price_prefix = ["giá", "", "với giá", "tại"]
    self.currency_unit = ["", "nghìn đồng", "vnđ", "nghìn"]
    self.prefix = ["nhận định", "tình hình", "thông tin", ""]
    self.suffix = ["biến động", "lên xuống"]
    self.quesword = ["thế nào", "ra sao", ""]
    self.infix = ["mã chứng khoán", "mã", "cổ phiếu", "mã cổ phiếu"]
    self.balance_word = ["", "còn dư", "dư"]
    self.stock_prefix = ["", "mã", "số"]
    self.conjunction = ["", "và"]
    self.advice_prefix = ["có", "nên", "có nên"]
    self.cash_prefix = ["tài khoản"]
    self.cash_infix = ["đuôi"]

    self.check_stopword = DataCleaner()
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path, encoding='latin1')

    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)

    # Split records in force during 2015 from the historical ones
    # (everything that does not start in 2016).
    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)
    gii = dc.df.desde.dt.year == y
    gif = dc.df.hasta.dt.year == y
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)
    givig = gii | gif | gis

    df1 = dc.df[givig].copy()
    print("The number of 2015 records is:")
    print(givig.sum())

    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016].copy()
    print("The number of historical records is:")
    print((~gin2016).sum())

    # pandas.DataFrame.to_csv takes `sep`, not `separator`.
    df1.to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR,
        index=False)
    df2.to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR,
        index=False)
    print("Cleanup finished successfully!")
def test_string_peg_split(self):
    input_path = get_input("string_separable_complejo")
    output_path = get_output("string_separable_complejo")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    parsed_df = dc.string_peg_split(
        "solicitante",
        """
        allowed_char = anything:x ?(x not in '1234567890() ')
        nombre = ~('DNI') <allowed_char+>:n ws -> n.strip()
        number = <digit+>:num -> int(num)
        nom_comp = <nombre+>:nc -> nc.strip()
        cargo = '(' <nombre+>:c ')' -> c.strip()
        dni = ','? ws 'DNI' ws number:num -> num
        values = nom_comp:n ws cargo?:c ws dni?:d ws anything* -> [n, c, d]
        """,
        ["nombre", "cargo", "dni"])
    res_1 = nan_safe_list(parsed_df["solicitante_nombre"])
    res_2 = nan_safe_list(parsed_df["solicitante_cargo"])
    res_3 = nan_safe_list(parsed_df["solicitante_dni"])

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp_1 = nan_safe_list(df["solicitante_nombre"])
    exp_2 = nan_safe_list(df["solicitante_cargo"])
    exp_3 = nan_safe_list(df["solicitante_dni"])

    self.assertEqual(res_1, exp_1)
    self.assertEqual(res_2, exp_2)
    self.assertEqual(res_3, exp_3)
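# `nan_safe_list` is a comparison helper used by several tests here but not
# defined in this snippet. A minimal sketch, assuming its job is to make two
# pandas Series comparable even when they contain NaN (NaN != NaN, so a plain
# list() comparison would fail on empty cells):
import pandas as pd

def nan_safe_list(series):
    """Converts a Series to a list, replacing NaN with None."""
    return [None if pd.isnull(value) else value for value in series]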
def tokenize(self):
    data_cleaner = DataCleaner(self.corpus)
    all_word, all_sentence_split = data_cleaner.clean_content()
    print('all_word')
    print(all_word)
    # print('all_sentence_split')
    # print(all_sentence_split)
    return all_word, all_sentence_split
def test_removing_line_breaks(self):
    input_path = get_input("with_line_breaks")
    output_path = get_output("with_line_breaks")

    dc = DataCleaner(input_path)
    df = pd.read_csv(output_path, encoding="utf-8")
    self.assertEqual(list(dc.df.columna), list(df.columna))
def test_cleaning_fields(self):
    input_path = get_input("fields")
    output_path = get_output("fields")

    dc = DataCleaner(input_path)
    df = pd.read_csv(output_path, encoding="utf-8")
    self.assertEqual(set(dc.df.columns), set(df.columns))
def test_shapefile_to_csv(self):
    output_path = BASE_DIR + '/output/localidades.csv'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    csv_df = pd.read_csv(output_path)
    self.assertEqual(set(csv_df.columns), set(dc.df.columns))
def test_shapefile_to_geojson(self):
    output_path = BASE_DIR + '/output/localidades.geojson'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    geojson_df = gpd.read_file(output_path, driver='GeoJSON')
    self.assertEqual(set(geojson_df.columns), set(dc.df.columns))
def test_remover_columnas(self):
    input_path = get_input("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    df = dc.remover_columnas(field)

    self.assertNotIn(field, df.columns)
def test_shapefile_to_kml(self):
    output_path = BASE_DIR + '/output/localidades.kml'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    with open(output_path) as kml_file:
        kml = kml_file.read()
    assert kml.startswith('<?xml version="1.0" encoding="utf-8" ?>')
def test_nombre_propio_keep_original(self):
    input_path = get_input("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    dc.nombre_propio(field, keep_original=True, inplace=True)

    self.assertIn("dependencia_normalizado", dc.df.columns)
def test_float_integrity(self):
    output_path = BASE_DIR + '/output/clean_coordinates.csv'
    dc = DataCleaner(self.input_path)
    dc.clean_file([], output_path)

    raw_input = raw_csv(self.input_path)
    raw_output = raw_csv(output_path)
    self.assertEqual(raw_input, raw_output)
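# `raw_csv` is not defined in this snippet. Since the test compares the raw
# input against the raw output to check that float formatting survives a
# clean/save round trip, a plausible sketch is a plain text read (the
# encoding is an assumption):

def raw_csv(path):
    """Returns the raw text content of a CSV file."""
    with open(path, encoding="utf-8") as f:
        return f.read()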
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path)
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    dc.save(output_path)
    print("Cleanup finished successfully!")
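# `RULES` is defined elsewhere in the project. For reference, data-cleaner
# rules are lists of {rule_name: [kwargs, ...]} dictionaries; a hypothetical
# example built from the rules exercised in the tests above (the field names
# are illustrative only):
RULES = [
    {"nombre_propio": [
        {"field": "dependencia", "keep_original": True}
    ]},
    {"fecha_completa": [
        {"field": "fecha_completa_audiencia",
         "time_format": "DD-MM-YYYY HH:mm"}
    ]}
]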
def apply_rules_to_dataset(csv_input, csv_output, dataset_file_rules,
                           parse_options):
    with warnings.catch_warnings(record=True) as caught_warnings:
        dc = DataCleaner(csv_input, **parse_options)
        dc.clean(dataset_file_rules['data-cleaner-rules'])
        dc.df.set_index(dc.df.columns[0]).to_csv(
            csv_output,
            encoding=dc.OUTPUT_ENCODING,
            sep=dc.OUTPUT_SEPARATOR,
            quotechar=dc.OUTPUT_QUOTECHAR
        )
    return caught_warnings
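# A hypothetical call to `apply_rules_to_dataset`, assuming the rules file
# was parsed into a dict with a 'data-cleaner-rules' key (all paths, rules
# and parse options below are illustrative):
caught = apply_rules_to_dataset(
    "input/dataset.csv",
    "output/dataset-clean.csv",
    {"data-cleaner-rules": [{"nombre_propio": [{"field": "dependencia"}]}]},
    {"encoding": "utf-8", "sep": ","}
)
for warning in caught:
    print(warning.message)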
def test_integration_case_1(self):
    dc = DataCleaner(get_input("integration"))
    dc.clean_file(rules, get_output("temp_integration"))

    df = pd.read_csv(get_output("temp_integration"))
    df_exp = pd.read_csv(get_output("integration"))

    self.assertEqual(set(df.columns), set(df_exp.columns))
    for col in df.columns:
        self.assertEqual(nan_safe_list(df[col]), nan_safe_list(df_exp[col]))
def test_string_regex_substitute(self):
    input_path = get_input("regex_sub")
    output_path = get_output("regex_sub")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.string_regex_substitute("lugar_audiencia", r"\d+.*$", "")
    res = list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df["lugar_audiencia"])

    self.assertEqual(res, exp)
def setUp(self):
    """Creates a new database for the unit test to use."""
    app.config.from_pyfile('test_config.py')
    db.init_app(app)
    db.create_all()

    self.dataCleaner = DataCleaner(test_config.SQLALCHEMY_DATABASE_URI)
    self.app = app.test_client()
    return self.app
def test_fecha_completa_keep_original(self):
    input_path = get_input("fecha_completa")
    field = "fecha_completa_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    dc.fecha_completa(field, "DD-MM-YYYY HH:mm",
                      keep_original=True, inplace=True)

    self.assertIn("isodatetime_fecha_completa_audiencia", dc.df.columns)
def test_nombre_propio(self):
    input_path = get_input("nombre_propio")
    output_path = get_output("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.nombre_propio(field)
    res = list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_string_normal(self):
    input_path = get_input("string_normal")
    output_path = get_output("string_normal")
    field = "lugar_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.string(field)
    res = list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_fecha_simple_mes(self):
    input_path = get_input("fecha_mes")
    output_path = get_output("fecha_mes")
    field = "fecha_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.fecha_simple(field, "MM-YYYY")
    res = nan_to_empty_string_list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = nan_to_empty_string_list(df["isodate_" + field])

    self.assertEqual(res, exp)
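# `nan_to_empty_string_list` is another comparison helper not defined in this
# snippet. A minimal sketch, assuming it maps NaN to "" so that empty cells
# compare equal regardless of how pandas parsed them:

def nan_to_empty_string_list(series):
    """Converts a Series to a list, replacing NaN with an empty string."""
    return ["" if pd.isnull(value) else value for value in series]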
def test_reemplazar_string(self):
    input_path = get_input("reemplazar_string")
    output_path = get_output("reemplazar_string")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.reemplazar_string(field, {"Jaguarete": ["ABBA", "ABBBA"]})
    res = list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_mail_format(self):
    input_path = get_input("mail_format")
    output_path = get_output("mail_format")
    field = "mail"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.mail_format(field)
    res = list(series)

    # load the clean csv for comparison
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_build_data(self):
    """Builds a dictionary of territorial units."""
    entity = 'localidad'
    field = 'nombre'
    test_data = {
        'localidades': [{
            'nombre': 'laferrere',
            'aplanar': True,
            'max': 1
        }]
    }

    input_path = get_input('normalize_unidad_territorial')
    dc = DataCleaner(input_path)
    data = dc._build_data(field, entity, filters={})
    self.assertEqual(data, test_data)