Example #1
    def test_remover_filas_duplicadas_based_on_field(self):
        input_path = get_input("filas_duplicadas_con_id")
        output_path = get_output("filas_duplicadas_con_id")

        dc = DataCleaner(input_path)
        df = dc.remover_filas_duplicadas(all_fields=False, fields=["id"])
        expected_df = DataCleaner(output_path).df

        self.assertTrue(df.equals(expected_df))
Example #2
    def test_remover_filas_duplicadas(self):
        input_path = get_input("filas_duplicadas")
        output_path = get_output("filas_duplicadas")

        dc = DataCleaner(input_path)
        df = dc.remover_filas_duplicadas(all_fields=True)
        expected_df = DataCleaner(output_path).df

        self.assertTrue(df.equals(expected_df))
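Most tests in this listing call get_input() and get_output() helpers whose definitions are not shown. A minimal sketch of what they might look like, assuming CSV fixtures live under input/ and output/ directories next to the tests (only the names come from the tests; the bodies are an assumption):

import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def get_input(case_name):
    # Hypothetical helper: path of the raw CSV fixture for a test case.
    return os.path.join(BASE_DIR, "input", case_name + ".csv")

def get_output(case_name):
    # Hypothetical helper: path of the expected clean CSV fixture.
    return os.path.join(BASE_DIR, "output", case_name + ".csv")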
Example #3
    def test_simplify_geometry(self):
        input_path = BASE_DIR + '/input/localidades/localidades.shp'
        original = BASE_DIR + '/output/localidades-original.csv'
        simplified = BASE_DIR + '/output/localidades-simplificado.csv'

        dc = DataCleaner(input_path)
        dc.save(original)  # CSV with the original geometry.
        dc = DataCleaner(input_path)
        dc.simplificar_geometria()
        dc.save(simplified)  # CSV with the simplified geometry.

        import filecmp
        files_are_equal = filecmp.cmp(original, simplified, shallow=False)
        self.assertFalse(files_are_equal)
Example #4
def word_notin_vocab():
    # Read the trained word2vec vocabulary.
    print('start getting /word ')
    input_size = 16
    window_size = 2
    embedding_dim = 50
    batch_size_word2vec = 8
    file_to_save_word2vec_data = ('word2vec_ver6/ws-' + str(window_size) +
                                  '-embed-' + str(embedding_dim) +
                                  'batch_size-' + str(batch_size_word2vec) +
                                  '.pkl')
    vectors, word2int, int2word = read_trained_data(file_to_save_word2vec_data)

    # Read all sentences from the file of unknown words.
    texts = []
    now = datetime.now()  # 'now' was undefined in the snippet; datetime.now() assumed
    print("Current day: %d" % now.day)
    print("Current year: %d" % now.year)
    print("Current month: %d" % now.month)

    with open(unknown_file_name, encoding="utf8") as file:
        for line in file:
            temp = line.split(",", 1)
            temp[1] = temp[1].lower()
            texts.append(temp[1])  # list of training sentences

    # Collect each out-of-vocabulary word together with the sentence
    # it appeared in.
    words_notin_vocab = []
    for text in texts:
        data_cleaner = DataCleaner(text)
        all_words = data_cleaner.separate_sentence()
        for word in all_words:
            if word not in word2int:
                words_notin_vocab.append([word, text])
    return jsonify(results=words_notin_vocab)
Example #5
    def test_get_encoding(self):
        input_path = BASE_DIR + '/input/non_unicode.csv'

        dc = DataCleaner(input_path)
        encoding = dc._get_file_encoding(input_path)

        self.assertNotEqual(encoding, 'utf-8')
Example #6
    def test_get_api_response(self):
        """Realiza un búsquedas sobre una entidad territorial."""
        entity = 'localidad'
        data_test = {
            'localidades': [{
                'nombre': 'laferrere',
                'aplanar': True,
                'provincia': 'buenos aires',
                'max': 1
            }]
        }
        res_test = [{
            'localidades': [{
                'departamento_nombre': 'La Matanza',
                'tipo': 'Entidad (E)',
                'centroide_lon': -58.592533,
                'municipio_nombre': 'La Matanza',
                'provincia_id': '06',
                'departamento_id': '06427',
                'id': '06427010004',
                'centroide_lat': -34.746838,
                'provincia_nombre': 'Buenos Aires',
                'nombre': 'GREGORIO DE LAFERRERE',
                'municipio_id': '060427'
            }]
        }]

        input_path = get_input('normalize_unidad_territorial')
        dc = DataCleaner(input_path)
        res = dc._get_api_response(entity, data_test)
        self.assertEqual(res_test, res)
Example #7
    def __init__(self):
        self.stock_code = []
        self.stock_name = []
        file_name = './data/stockslist.txt'
        self.read_stock_data(file_name)
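        # Vietnamese phrase fragments (subjects, actions, quantities, price
        # and ticker markers) used to assemble stock-trading utterances.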
        self.help_subject = ["xem", "xem cho tôi", "cho tôi xem"]
        self.chu_ngu = [
            "tôi có nhu cầu ", "tao muốn", "", "mình cần", "tôi cần",
            "mình muốn", "đặt lênh"
        ]
        self.actions = [
            "mua", "bán", "chuyển nhượng", "sang tên", "đầu tư thêm", "gom",
            "thêm", "mua thêm"
        ]
        self.amounts = ["", "khối lượng ", "số lượng"]
        self.sub_amounts = ["", "cái", "cổ phiếu", "cổ"]
        self.words = ["tôi muốn", "bán", "mã", "khối lương", "giá"]
        self.price_prefix = ["giá", "", "với giá", "tại"]
        self.currency_unit = ["", "nghìn đồng", "vnđ", "nghìn"]
        self.prefix = ["nhận định", "tình hình", "thông tin", ""]
        self.suffix = ["biến động", "lên xuống"]
        self.quesword = ["thế nào", "ra sao", ""]
        self.infix = ["mã chứng khoán", "mã", "cổ phiếu", "mã cổ phiếu"]
        self.balance_word = ["", "còn dư", "dư"]
        self.stock_prefix = ["", "mã", "số"]
        self.conjunction = ["", "và"]
        self.advice_prefix = ["có", "nên", "có nên"]

        self.cash_prefix = ["tài khoản"]
        self.cash_infix = ["đuôi"]

        self.check_stopword = DataCleaner()
Example #8
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path, encoding='latin1')
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)
    # Records in force during 2015: starting in it, ending in it,
    # or spanning it entirely.
    gii = dc.df.desde.dt.year == y
    gif = dc.df.hasta.dt.year == y
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)
    givig = gii | gif | gis
    df1 = dc.df[givig].copy()
    print("The number of 2015 records is: ")
    print(givig.sum())
    # Historical records: everything that does not start in 2016.
    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016].copy()
    print("The number of historical records is: ")
    print((~gin2016).sum())
    # Note: to_csv takes 'sep', not 'separator'; also, the output_path
    # argument is unused and results go to the DEFAULT_* paths.
    df1.to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR, index=False)
    df2.to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR, index=False)

    print("Cleanup finished successfully!")
Example #9
    def test_string_peg_split(self):
        input_path = get_input("string_separable_complejo")
        output_path = get_output("string_separable_complejo")

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        parsed_df = dc.string_peg_split(
            "solicitante", """
            allowed_char = anything:x ?(x not in '1234567890() ')
            nombre = ~('DNI') <allowed_char+>:n ws -> n.strip()
            number = <digit+>:num -> int(num)

            nom_comp = <nombre+>:nc -> nc.strip()
            cargo = '(' <nombre+>:c ')' -> c.strip()
            dni = ','? ws 'DNI' ws number:num -> num

            values = nom_comp:n ws cargo?:c ws dni?:d ws anything* -> [n, c, d]
            """, ["nombre", "cargo", "dni"])
        res_1 = nan_safe_list(parsed_df["solicitante_nombre"])
        res_2 = nan_safe_list(parsed_df["solicitante_cargo"])
        res_3 = nan_safe_list(parsed_df["solicitante_dni"])

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp_1 = nan_safe_list(df["solicitante_nombre"])
        exp_2 = nan_safe_list(df["solicitante_cargo"])
        exp_3 = nan_safe_list(df["solicitante_dni"])

        self.assertEqual(res_1, exp_1)
        self.assertEqual(res_2, exp_2)
        self.assertEqual(res_3, exp_3)
Example #10
    def tokenize(self):
        data_cleaner = DataCleaner(self.corpus)
        all_word, all_sentence_split = data_cleaner.clean_content()
        print('all_word')
        print(all_word)
        # print('all_sentence_split')
        # print(all_sentence_split)
        return all_word, all_sentence_split
Example #11
    def test_removing_line_breaks(self):
        input_path = get_input("with_line_breaks")
        output_path = get_output("with_line_breaks")

        dc = DataCleaner(input_path)
        df = pd.read_csv(output_path, encoding="utf-8")

        self.assertEqual(list(dc.df.columna), list(df.columna))
Example #12
    def test_cleaning_fields(self):
        input_path = get_input("fields")
        output_path = get_output("fields")

        dc = DataCleaner(input_path)
        df = pd.read_csv(output_path, encoding="utf-8")

        self.assertEqual(set(dc.df.columns), set(df.columns))
Example #13
    def test_shapefile_to_csv(self):
        output_path = BASE_DIR + '/output/localidades.csv'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        csv_df = pd.read_csv(output_path)
        self.assertEqual(set(csv_df.columns), set(dc.df.columns))
Example #14
    def test_shapefile_to_geojson(self):
        output_path = BASE_DIR + '/output/localidades.geojson'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        geojson_df = gpd.read_file(output_path, driver='GeoJSON')
        self.assertEqual(set(geojson_df.columns), set(dc.df.columns))
Example #15
    def test_remover_columnas(self):
        input_path = get_input("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        df = dc.remover_columnas(field)

        self.assertNotIn(field, df.columns)
Example #16
    def test_shapefile_to_kml(self):
        output_path = BASE_DIR + '/output/localidades.kml'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        with open(output_path) as kml_file:
            kml = kml_file.read()
            assert kml.startswith('<?xml version="1.0" encoding="utf-8" ?>')
Example #17
    def test_nombre_propio_keep_original(self):
        input_path = get_input("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        dc.nombre_propio(field, keep_original=True, inplace=True)

        self.assertIn("dependencia_normalizado", dc.df.columns)
Example #18
    def test_float_integrity(self):
        output_path = BASE_DIR + '/output/clean_coordinates.csv'

        dc = DataCleaner(self.input_path)
        dc.clean_file([], output_path)

        raw_input = raw_csv(self.input_path)
        raw_output = raw_csv(output_path)
        self.assertEqual(raw_input, raw_output)
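raw_csv() is also undefined in this listing; the float-integrity test only makes sense if it returns the file's raw text, so that any float reformatting introduced by the clean-and-save round trip shows up as an inequality. A sketch under that assumption (the body is hypothetical):

def raw_csv(path, encoding="utf-8"):
    # Hypothetical helper: return the raw text of a CSV so two files can be
    # compared verbatim, catching float reformatting such as 1.10 -> 1.1.
    with open(path, encoding=encoding) as f:
        return f.read()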
Example #19
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path)
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    dc.save(output_path)
    print("Cleanup finished successfully!")
Example #20
def apply_rules_to_dataset(csv_input, csv_output, dataset_file_rules, parse_options):
    with warnings.catch_warnings(record=True) as caught_warnings:
        dc = DataCleaner(csv_input, **parse_options)
        dc.clean(dataset_file_rules['data-cleaner-rules'])
        dc.df.set_index(dc.df.columns[0]).to_csv(
            csv_output,
            encoding=dc.OUTPUT_ENCODING,
            sep=dc.OUTPUT_SEPARATOR,
            quotechar=dc.OUTPUT_QUOTECHAR
        )
        return caught_warnings
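A hypothetical invocation of apply_rules_to_dataset(), assuming dataset_file_rules is parsed from a config whose 'data-cleaner-rules' key holds a rules list in the same format dc.clean(RULES) consumes elsewhere in this listing (all values below are illustrative, not confirmed by the source):

dataset_file_rules = {
    "data-cleaner-rules": [
        {"nombre_propio": [{"field": "dependencia"}]}
    ]
}
parse_options = {"encoding": "utf-8"}
caught = apply_rules_to_dataset(
    "raw.csv", "clean.csv", dataset_file_rules, parse_options)
for warning in caught:
    print(warning.message)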
Example #21
    def test_integration_case_1(self):
        dc = DataCleaner(get_input("integration"))
        dc.clean_file(rules, get_output("temp_integration"))

        df = pd.read_csv(get_output("temp_integration"))
        df_exp = pd.read_csv(get_output("integration"))

        self.assertEqual(set(df.columns), set(df_exp.columns))
        for col in df.columns:
            self.assertEqual(nan_safe_list(df[col]),
                             nan_safe_list(df_exp[col]))
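nan_safe_list() is another helper these tests rely on without showing. Because NaN != NaN, a plain list comparison of two columns that contain missing values always fails; a plausible sketch that maps NaN to None so the lists compare by value (the name comes from the tests, the body is an assumption):

import pandas as pd

def nan_safe_list(series):
    # Hypothetical helper: convert a Series to a list, replacing NaN with
    # None so columns with missing values in the same positions compare equal.
    return [None if pd.isnull(value) else value for value in series]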
Example #22
    def test_string_regex_substitute(self):
        input_path = get_input("regex_sub")
        output_path = get_output("regex_sub")

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.string_regex_substitute("lugar_audiencia", r"\d+.*$", "")
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df["lugar_audiencia"])

        self.assertEqual(res, exp)
Example #23
    def setUp(self):
        """
        Creates a new database for the unit test to use
        """
        app.config.from_pyfile('test_config.py')
        db.init_app(app)
        db.create_all()

        self.dataCleaner = DataCleaner(test_config.SQLALCHEMY_DATABASE_URI)

        self.app = app.test_client()
        return self.app
Example #24
    def test_fecha_completa_keep_original(self):
        input_path = get_input("fecha_completa")
        field = "fecha_completa_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        dc.fecha_completa(field,
                          "DD-MM-YYYY HH:mm",
                          keep_original=True,
                          inplace=True)

        self.assertIn("isodatetime_fecha_completa_audiencia", dc.df.columns)
Example #25
    def test_nombre_propio(self):
        input_path = get_input("nombre_propio")
        output_path = get_output("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.nombre_propio(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
Example #26
    def test_string_normal(self):
        input_path = get_input("string_normal")
        output_path = get_output("string_normal")
        field = "lugar_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.string(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
Example #27
    def test_fecha_simple_mes(self):
        input_path = get_input("fecha_mes")
        output_path = get_output("fecha_mes")
        field = "fecha_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.fecha_simple(field, "MM-YYYY")
        res = nan_to_empty_string_list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = nan_to_empty_string_list(df["isodate_" + field])

        self.assertEqual(res, exp)
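nan_to_empty_string_list() is likewise assumed rather than shown; values that fail date parsing come back as NaN, and blank cells in the expected CSV also read back as NaN, so mapping NaN to an empty string makes the two lists comparable. A sketch (the body is hypothetical):

import pandas as pd

def nan_to_empty_string_list(series):
    # Hypothetical helper: convert a Series to a list, replacing NaN with ""
    # so unparsed dates and blank CSV cells compare equal.
    return ["" if pd.isnull(value) else value for value in series]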
Example #28
    def test_reemplazar_string(self):
        input_path = get_input("reemplazar_string")
        output_path = get_output("reemplazar_string")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.reemplazar_string(field, {"Jaguarete": ["ABBA", "ABBBA"]})
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
Example #29
    def test_mail_format(self):
        input_path = get_input("mail_format")
        output_path = get_output("mail_format")
        field = "mail"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.mail_format(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
Example #30
    def test_build_data(self):
        """Construye un diccionario con unidades territoriales."""
        entity = 'localidad'
        field = 'nombre'
        test_data = {
            'localidades': [{
                'nombre': 'laferrere',
                'aplanar': True,
                'max': 1
            }]
        }

        input_path = get_input('normalize_unidad_territorial')
        dc = DataCleaner(input_path)
        data = dc._build_data(field, entity, filters={})
        self.assertEqual(data, test_data)