def test_profile_real_database_erm2_nwe9_with_skip_should_profile_with_success(self):
    """Profiling erm2-nwe9 with skip_views enabled and the skip check forced
    True must mark the dataset summary with the skip-view status."""
    ApplicationOptions.OPTIONS = {
        'silent': True,
        'verbose': False,
        'stop_on_error': True,
        'skip_views': True,
    }
    subject = Profiler()
    # Force the view-detection step to report a skippable dataset.
    subject.check_if_skip_dataset = MagicMock(return_value=True)
    subject.profile(ResourceUtils.get_test_resource_path('erm2-nwe9'))
    status = subject.last_sumary.ix[0]['ETL-Profiler Status']
    self.assertEqual(status, Profiler.MSG_SKIP_VIEW)
def test_prepare_location_columns_with_vz8c_29aj_csv_creates_gps_column(self):
    """prepare_location_columns must derive a '<col><prefix>gps' column for
    each metadata column typed as 'location'."""
    column_types = {
        'Phone': 'text',
        'Districts Served': 'text',
        'Borough': 'text',
        'Location 1': 'location',
    }
    frame = pandas.read_csv(
        ResourceUtils.get_test_resource_path('vz8c-29aj.csv'))
    SocrataUtils.prepare_location_columns(frame, column_types)
    expected_column = 'Location 1' + SocrataUtils.PREFIX_NEW_COLUMN + 'gps'
    self.assertIn(expected_column, list(frame.columns))
def test_profile_real_database_with_socrata_metadata_vz8c_29aj_csv_has_gps_values(self):
    """Profiling vz8c-29aj.csv must detect 4 geo columns and at least one GPS value.

    Fix: removed two leftover debug `print` statements that dumped the whole
    summary row on every run and cluttered the test output.
    """
    ApplicationOptions.OPTIONS = {
        'silent': True,
        'verbose': False,
        'stop_on_error': True,
        'skip_views': True,
    }
    profiler = Profiler()
    profiler.profile(ResourceUtils.get_test_resource_path('vz8c-29aj.csv'))
    summary = profiler.last_sumary.ix[0]
    self.assertEqual(4, summary['Columns Geo'])
    self.assertGreater(summary['GPS Values'], 0)
def test_get_test_resource_path_folder_is_right(self):
    """The test-resource path must live under .../test/resources/.

    Fix: str.rstrip strips a *character set* ('a'/'A'), not a suffix, so
    `answer.rstrip('aAa')` would also eat trailing a/A characters of the
    folder itself; slice the known suffix off instead.
    """
    answer = ResourceUtils.get_test_resource_path('aAa')
    folder = answer[:-len('aAa')]
    # print 'folder= ', folder
    assert folder.endswith('/test/resources/')
import unittest
import pandas
from urban_profiler import ApplicationOptions as App
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector

# Sample open-data CSV fixtures used by the type-detection tests below.
DATABASE_NYPD_MVCS_PATH = ResourceUtils.get_test_resource_path(
    'h9gi-nx95_SAMPLE.csv')
DATABASE_311_PATH = ResourceUtils.get_test_resource_path(
    '311_Service_Requests_2009_SAMPLE.csv')
DATABASE_2j7x_tvss_PATH = ResourceUtils.get_test_resource_path(
    '2j7x-tvss_SAMPLE.csv')
DATABASE_26ze_s5bx_PATH = ResourceUtils.get_test_resource_path('26ze-s5bx.csv')


class TypeDetectorTests(unittest.TestCase):
    """Tests for TypeDetector against several sample open-data CSVs."""

    @classmethod
    def setUpClass(cls):
        # Load each fixture CSV once per class to keep the suite fast.
        cls.database_NYPD_MVC = pandas.read_csv(DATABASE_NYPD_MVCS_PATH)
        cls.database_311 = pandas.read_csv(DATABASE_311_PATH)
        cls.database_2j7x_tvss = pandas.read_csv(DATABASE_2j7x_tvss_PATH)
        cls.database_26ze_s5bx = pandas.read_csv(DATABASE_26ze_s5bx_PATH)

    def setUp(self):
        # Silence debug output for every test; expose the class-level
        # fixtures through instance attributes for convenience.
        App.stop_debuging()
        # print '[TEST:', self._testMethodName, ']'
        self.database_NYPD_MVC = TypeDetectorTests.database_NYPD_MVC
        self.database_311 = TypeDetectorTests.database_311
        self.database_2j7x_tvss = TypeDetectorTests.database_2j7x_tvss
        self.database_26ze_s5bx = TypeDetectorTests.database_26ze_s5bx
def test_regressions_DB_2bh6_qmgg_Mean_Scale_Score_has_no_zip_codes(self):
    """Regression: detect_zip reports 54 matches in 2bh6-qmgg's score column."""
    # App.start_debuging()
    dataset_path = ResourceUtils.get_test_resource_path('2bh6-qmgg')
    scores = pandas.read_csv(dataset_path)['Mean Scale Score']
    zip_matches = TypeDetector.detect_zip(scores)[1]
    self.assertEqual(54, len(zip_matches))
def test_regressions_DB_2bh6_qmgg_Mean_Scale_Score_is_Geo_Zip_but_should_be_Numeric_Integer(self):
    """Documents a known limitation: 'Mean Scale Score' is currently
    classified as GEO_ZIP although it should ideally be a numeric integer."""
    scores = pandas.read_csv(
        ResourceUtils.get_test_resource_path('2bh6-qmgg'))['Mean Scale Score']
    winning_type = TypeDetector.most_detected(TypeDetector.types_of(scores))[0]
    self.assertEqual(TypeDetector.GEO_ZIP, winning_type)
# To change this license header, choose License Headers in Project Properties. # To change this template file, choose Tools | Templates # and open the template in the editor. import unittest from urban_profiler.utils import ResourceUtils from urban_profiler.profiler.Profiler import Profiler from urban_profiler import ApplicationOptions from mock import MagicMock DATABASE_h9gi_nx95_SAMPLE_NO_EXTENSION_PATH = ResourceUtils.get_test_resource_path( "h9gi-nx95_SAMPLE") ########################################### ## NYPD Vehicle Motor Colision is h9gi-nx95 ########################################### class Profiler_TestCase_NYPD(unittest.TestCase): summary_h9gi_nx95 = None summary_h9gi_nx95_SAMPLE = None def setUp(self): # print '[TEST:', self._testMethodName, ']' if Profiler_TestCase_NYPD.summary_h9gi_nx95_SAMPLE is None: verbose = False ApplicationOptions.OPTIONS = { 'silent': not verbose, 'verbose': verbose, 'stop_on_error': True, 'show_details': True
def test_get_resource_path_folder_is_right(self):
    """resource_path_of must resolve under .../urban_profiler/resources/.

    Fix: str.rstrip strips a *character set* ('a'/'A'), not a suffix, so
    `answer.rstrip('aAa')` would also eat trailing a/A characters of the
    folder itself; slice the known suffix off instead.
    """
    answer = ResourceUtils.resource_path_of('aAa')
    folder = answer[:-len('aAa')]
    # print 'folder= ', folder
    assert folder.endswith('/urban_profiler/resources/')
import unittest
import pandas
from urban_profiler.utils import ResourceUtils
from urban_profiler.utils import PandasUtils as PandasUtils

# Fixture datasets: one plain CSV and one Socrata-style JSON export.
DATABASE_NYPD_MVCS_PATH = ResourceUtils.get_test_resource_path(
    'h9gi-nx95_SAMPLE.csv')
DATABASE_2nju_4jd4_PATH = ResourceUtils.get_test_resource_path(
    '2nju-4jd4.json')


class PandaUtils_Tests(unittest.TestCase):
    """Tests for PandasUtils.load_database over CSV and JSON inputs."""

    @classmethod
    def setUpClass(cls):
        # Load the CSV fixture once for the whole class.
        PandaUtils_Tests.database_NYPD_MVC = pandas.read_csv(
            DATABASE_NYPD_MVCS_PATH)

    def setUp(self):
        # print '[TEST:', self._testMethodName, ']'
        self.database_NYPD_MVC = PandaUtils_Tests.database_NYPD_MVC

    # ========================================================================== load_database()
    def test_load_database_with_csv(self):
        assert PandasUtils.load_database(DATABASE_NYPD_MVCS_PATH) is not None

    def test_load_json_with_data_and_metadata_and_subcolumns(self):
        # 24 columns = data + metadata + flattened sub-columns for this export.
        data_frame = PandasUtils.load_database(DATABASE_2nju_4jd4_PATH)
        self.assertEqual(24, len(data_frame.columns))

    def test_load_with_file_not_found_should_raise_error(self):
def data_detectors():
    """Return the detector set to use as a (label, detectors) pair.

    When the 'types_file' option points at an existing CSV (or is the string
    'true', which falls back to TYPES_REFERENCE_FILE), the detectors are
    loaded dynamically from that CSV — one row per type with columns:
    0-1 name parts, 2 regex, 3 dictionary (inline CSV or file name),
    4 dictionary-is-file flag, 5 accept-nulls flag, 6 comparison type.
    The parsed list is cached in the module-level LOADED_DETECTORS.
    Otherwise the built-in static detector list is returned, in the exact
    order the detectors must run.
    """
    # print 'aaaaaaaaaaaaaaaaaaaaaaaa'
    # print 'TYPES_REFERECE_FILE=', TYPES_REFERECE_FILE
    # print 'os.path.exists(TYPES_REFERECE_FILE)= ', os.path.exists(TYPES_REFERECE_FILE)
    types_file = App.get_option('types_file', default=None)
    if types_file and types_file.lower() == 'true':
        # 'true' means "use the bundled reference file".
        types_file = TYPES_REFERENCE_FILE
    if types_file and os.path.exists(types_file):
        global LOADED_DETECTORS
        if LOADED_DETECTORS is None:
            App.debug(' >>> Loading dynamic types from file: ', types_file)
            types = pandas.read_csv(types_file, header=None,
                                    skipinitialspace=True)
            types = types.where((pandas.notnull(types)), None)  # Transform NaN into None
            LOADED_DETECTORS = []
            for i in types.index:
                App.debug("")
                # 1. Name — columns 0 and 1 are joined when they differ.
                name = types.ix[i][0]
                if types.ix[i][0] != types.ix[i][1]:
                    name += '-' + types.ix[i][1]
                App.debug("name= ", name)
                # 2. Regex — compiled only when a pattern string is present.
                regex_list = types.ix[i][2]
                App.debug("regex= ", regex_list)
                if type(regex_list) == str:
                    regex_list = re.compile(types.ix[i][2])
                # 3 & 4. Prepare values dictionary
                values_dictionary = types.ix[i][3]
                App.debug("values_dictionary= ", values_dictionary)
                dictionary_is_file = types.ix[i][4]
                App.debug("dictionary_is_file= ", dictionary_is_file)
                if type(values_dictionary) == str:  # is not None or Nan
                    # Read the file into the csv
                    if dictionary_is_file:
                        with open(
                                ResourceUtils.resource_path_of(
                                    values_dictionary)) as dict_file:
                            values_dictionary = dict_file.read()
                    # Parse string CSV into a set
                    reader = csv.reader(values_dictionary.splitlines(),
                                        delimiter=',',
                                        skipinitialspace=True)
                    values_dictionary = []
                    for row in reader:
                        values_dictionary.extend(row)
                    values_dictionary = set(values_dictionary)
                # 5. Accept Nulls?
                accept_nulls = types.ix[i][5]
                App.debug("accept_nulls= ", accept_nulls)
                # 6. Comparison type
                comparisson_type = types.ix[i][6]
                App.debug("Dictionary comparisson type= ", comparisson_type)
                LOADED_DETECTORS.append({
                    DETECTOR_NAME: name,
                    REGEX_LIST: [regex_list],
                    DICTIONARY: values_dictionary,
                    ACCEPT_NULLS: accept_nulls,
                    DICTIONARY_COMPARISON_TYPE: comparisson_type,
                })
            App.debug('Loaded types:')
            for item in LOADED_DETECTORS:
                App.debug(item[DETECTOR_NAME])
        # NOTE(review): the literal label here matches DYNAMIC_DETECTORS.
        return 'Dynamic', LOADED_DETECTORS
    else:
        # Detector must be in desired order to run
        return STATIC_DETECTORS, [
            {
                DETECTOR_NAME: NULL,
                FUNCTION: detect_null
            },
            {
                DETECTOR_NAME: GEO_ZIP,
                FUNCTION: detect_zip
            },
            DETECTOR_SSN,
            DETECTOR_GEO_ZIP_9,
            DETECTOR_GEO_GPS_LAT_OR_LON,
            DETECTOR_GEO_GPS,
            DETECTOR_GEO_BOROUGH,
            DETECTOR_GEO_ADDRESS,
            # {DETECTOR_NAME: GEO_ADDRESS, FUNCTION: detect_us_address},
            DETECTOR_TEMPORAL_DATE,
            DETECTOR_TEMPORAL_TIME,
            DETECTOR_TEMPORAL_DATE_TIME,
            DETECTOR_PHONE,
            DETECTOR_NUMERIC_INT,
            DETECTOR_NUMERIC_DOUBLE,
            {
                DETECTOR_NAME: TEXTUAL,
                FUNCTION: detect_text
            },
        ]
import pandas
import re
import numpy
from urban_profiler import ApplicationOptions as App
from types import FunctionType as function
# NOTE(review): the next import shadows the builtin 'dict' in this module —
# later code may rely on it for isinstance checks; confirm before changing.
from types import DictType as dict
import operator
from urban_profiler.utils import ResourceUtils
from urban_profiler.utils import TextUtils
from os.path import expanduser
import csv
import os
import usaddress
from urban_profiler import ApplicationConstants as Constants

# CSV describing the dynamically loadable detector types.
TYPES_REFERENCE_FILE = ResourceUtils.resource_path_of('types_to_detect.csv')
DEBUG = True
# Decimal places used when reporting percentages.
PERCENTUAL_PRECISION = 3
# Lower-cased tokens treated as null/missing values.
NULL_VALUES = ['nan', 'none', 'n/a', 'null']
# Labels identifying which detector set data_detectors() returned.
STATIC_DETECTORS = 'Static'
DYNAMIC_DETECTORS = 'Dynamic'

# Types - Prefixes
NUMERIC = 'Numeric'
TEXTUAL = 'Textual'
GEO = 'Geo'
TEMPORAL = 'Temporal'
NULL = 'Null'
TYPE_PREFIXES = [NULL, TEXTUAL, NUMERIC, GEO, TEMPORAL]
def test_profile_real_database_with_socrata_metadata_vz8c_29aj_csv(self):
    """Profiling vz8c-29aj.csv end-to-end must finish with status 'OK'."""
    dataset = ResourceUtils.get_test_resource_path('vz8c-29aj.csv')
    self.profiler.profile(dataset)
    status = self.profiler.last_sumary.ix[0]['ETL-Profiler Status']
    self.assertEqual('OK', status)
def test_get_test_resource_path_folder_has_test_path(self):
    """Paths from get_test_resource_path must contain the test-resources root.

    Fix: removed the dead local 'folder' (computed via a fragile
    rstrip-as-suffix misuse and never used by the assertion).
    """
    answer = ResourceUtils.get_test_resource_path('aAa')
    assert TEST_PATH in answer
# To change this license header, choose License Headers in Project Properties. # To change this template file, choose Tools | Templates # and open the template in the editor. import unittest import sys sys.path.append("/urban_profiler/plot") from urban_profiler.utils import ResourceUtils from urban_profiler import Main from urban_profiler.utils import CLI as CLI from urban_profiler import ApplicationOptions LIST_DATABASES_FAST_PATH = ResourceUtils.get_test_resource_path( 'open_data_test_list_fast.csv') LIST_DATABASES_FAST_WITH_ERROR_PATH = ResourceUtils.get_test_resource_path( 'open_data_test_list_fast_with_Error.csv') class Profile_Multiple_Datasets(unittest.TestCase): @classmethod def setUpClass(cls): ApplicationOptions.OPTIONS = {'silent': True, 'stop_on_error': True} # def setUp(self): # print '[TEST:', self._testMethodName, ']' # TODO: refactor those tests and uncomment them. # def test_profile_dataset_list_with_error(self): # CLI.ARGS = ['--silent', '--stop_on_error', '--to_folder=/tmp', '--file=a'] # self.assertRaises(Exception, Main.main)
def test_get_resource_path_file_is_right(self):
    """resource_path_of must keep the requested file name as the path suffix."""
    resource_name = 'aAa'
    resolved = ResourceUtils.resource_path_of(resource_name)
    # print 'answer= ', resolved
    assert resolved.endswith(resource_name)
import unittest
import pandas
from urban_profiler import ApplicationOptions as App
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector

# Sample of the NYPD Motor Vehicle Collisions dataset (h9gi-nx95).
DATABASE_h9gi_nx95_PATH = ResourceUtils.get_test_resource_path('h9gi-nx95_SAMPLE.csv')


class TypeDetectorTests(unittest.TestCase):
    """Type-detection tests against the h9gi-nx95 sample."""

    @classmethod
    def setUpClass(cls):
        # Load the fixture CSV once per class.
        cls.database_h9gi_nx95 = pandas.read_csv(DATABASE_h9gi_nx95_PATH)

    def setUp(self):
        App.stop_debuging()
        # print '[TEST:', self._testMethodName, ']'
        self.database_h9gi_nx95 = TypeDetectorTests.database_h9gi_nx95
        # Fresh per-test tally of how many columns matched each detector type.
        self.type_counts = {
            TypeDetector.GEO_GPS: 0,
            TypeDetector.GEO_ZIP: 0,
            TypeDetector.GEO_BOROUGH: 0,
            TypeDetector.TEMPORAL_DATE: 0,
            TypeDetector.TEMPORAL_TIME: 0,
            TypeDetector.TEMPORAL_DATE_TIME: 0,
            TypeDetector.NUMERIC_INT: 0,
            TypeDetector.NUMERIC_DOUBLE: 0,
            TypeDetector.TEXTUAL: 0,
            TypeDetector.NULL: 0}

    # --------------------------------------------------------------------------- Regression Tests
    # def test_regressions_DB_h9gi_nx95_CONTRIBUTING_FACTOR_VEHICLE_1_IS_TEXT(self):
    #     col_data = self.database_h9gi_nx95['CONTRIBUTING FACTOR VEHICLE 1']
    #     detected_type = TypeDetector.type_of_column_data(col_data)
def test_get_resource_path_folder_dont_have_test_path(self):
    """Non-test resource paths must NOT resolve under the test-resources root.

    Fix: removed the dead local 'folder' (computed via a fragile
    rstrip-as-suffix misuse and never used by the assertion).
    """
    answer = ResourceUtils.resource_path_of('aAa')
    assert TEST_PATH not in answer
import pandas
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector
from urban_profiler import ApplicationConstants as Constants

# Columns of the PLUTO index used for geo lookups/enrichment.
GEO_INDEX_COLUMNS = ['address', 'lat', 'lon', 'zipcode', 'borough']
NY_GEO_DATA_FILE = ResourceUtils.resource_path_of('nyc_pluto_prepared.csv')

# bounding boxes of each zipcode according to NYC PLUTO
ZIPCODES_LAT_LON_MIN_MAX = pandas.read_csv(
    ResourceUtils.resource_path_of('zipcode_lat_lon.csv'), dtype=str)

# Which columns each kind of source data can be enriched with.
ZIPCODE_IMPROVEMENTS_WITH_IT = ['zipcode', 'borough']
ADDRESS_IMPROVEMENTS_WITH_IT = ['lat', 'lon', 'address', 'zipcode', 'borough']
GPS_IMPROVEMENTS_WITH_IT = ['lat', 'lon', 'address', 'zipcode', 'borough']

# Lazily loaded PLUTO frame; populated on first call to get_ny_geo_data().
NY_GEO_DATA = None


def get_ny_geo_data():
    """Load and cache the NYC PLUTO geo index (geo columns only)."""
    global NY_GEO_DATA
    if NY_GEO_DATA is None:
        print '(Loading NYC Pluto Index...)'
        NY_GEO_DATA = pandas.read_csv(NY_GEO_DATA_FILE)[GEO_INDEX_COLUMNS]
        # NY_GEO_DATA['gps'] = NY_GEO_DATA.lat.astype(str) + ',' + NY_GEO_DATA.lon.astype(str)
    return NY_GEO_DATA


def improve_zipcode_data(zipcode_data):
    # 1. get NY data higher than zipcode
    nyc_data = get_ny_geo_data()[ZIPCODE_IMPROVEMENTS_WITH_IT]
    # Drop PLUTO rows with no zipcode before matching.
    nyc_data = nyc_data[pandas.notnull(nyc_data.zipcode)]