예제 #1
0
 def test_profile_real_database_erm2_nwe9_with_skip_should_profile_with_success(
         self):
     # Profile 'erm2-nwe9' with the skip check stubbed to True and expect the
     # first summary row to carry the skip-view status message.
     options = {
         'silent': True,
         'verbose': False,
         'stop_on_error': True,
         'skip_views': True
     }
     ApplicationOptions.OPTIONS = options

     subject = Profiler()
     # Stub the skip decision so it always answers True.
     subject.check_if_skip_dataset = MagicMock(return_value=True)
     dataset_path = ResourceUtils.get_test_resource_path('erm2-nwe9')
     subject.profile(dataset_path)

     first_row = subject.last_sumary.ix[0]
     self.assertEqual(first_row['ETL-Profiler Status'],
                      Profiler.MSG_SKIP_VIEW)
    def test_prepare_location_columns_with_vz8c_29aj_csv_creates_gps_column(
            self):
        # After prepare_location_columns runs, the frame must expose a column
        # named 'Location 1' + PREFIX_NEW_COLUMN + 'gps'.
        column_types = {
            'Phone': 'text',
            'Districts Served': 'text',
            'Borough': 'text',
            'Location 1': 'location'
        }
        csv_path = ResourceUtils.get_test_resource_path('vz8c-29aj.csv')
        frame = pandas.read_csv(csv_path)

        SocrataUtils.prepare_location_columns(frame, column_types)
        column_names = list(frame.columns)

        expected_column = 'Location 1' + SocrataUtils.PREFIX_NEW_COLUMN + 'gps'
        self.assertIn(expected_column, column_names)
예제 #3
0
    def test_profile_real_database_with_socrata_metadata_vz8c_29aj_csv_has_gps_values(
            self):
        ApplicationOptions.OPTIONS = {
            'silent': True,
            'verbose': False,
            'stop_on_error': True,
            'skip_views': True
        }
        profiler = Profiler()
        profiler.profile(ResourceUtils.get_test_resource_path('vz8c-29aj.csv'))
        summary = profiler.last_sumary.ix[0]
        print 'summary\n', summary
        print 'summary\n', summary['Column Names Geo']

        self.assertEqual(4, summary['Columns Geo'])
        self.assertGreater(summary['GPS Values'], 0)
예제 #4
0
 def test_get_test_resource_path_folder_is_right(self):
     # The resolved path must be the test-resources folder followed by the
     # requested file name. The whole suffix is checked at once because
     # str.rstrip('aAa') removes any trailing 'a'/'A' characters, not the
     # literal file name, so it would also accept a path that lost the name.
     answer = ResourceUtils.get_test_resource_path('aAa')
     assert answer.endswith('/test/resources/aAa')
import unittest
import pandas

from urban_profiler import ApplicationOptions as App
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector

# Paths of the CSV fixtures used by the tests below, each resolved through
# ResourceUtils.get_test_resource_path.
DATABASE_NYPD_MVCS_PATH = ResourceUtils.get_test_resource_path(
    'h9gi-nx95_SAMPLE.csv')
DATABASE_311_PATH = ResourceUtils.get_test_resource_path(
    '311_Service_Requests_2009_SAMPLE.csv')
DATABASE_2j7x_tvss_PATH = ResourceUtils.get_test_resource_path(
    '2j7x-tvss_SAMPLE.csv')
DATABASE_26ze_s5bx_PATH = ResourceUtils.get_test_resource_path('26ze-s5bx.csv')


class TypeDetectorTests(unittest.TestCase):
    """Type-detector tests backed by four CSV fixtures read once per class."""

    @classmethod
    def setUpClass(cls):
        # Read each fixture a single time and keep the frame on the class.
        fixtures = (
            ('database_NYPD_MVC', DATABASE_NYPD_MVCS_PATH),
            ('database_311', DATABASE_311_PATH),
            ('database_2j7x_tvss', DATABASE_2j7x_tvss_PATH),
            ('database_26ze_s5bx', DATABASE_26ze_s5bx_PATH),
        )
        for attribute, csv_path in fixtures:
            setattr(cls, attribute, pandas.read_csv(csv_path))

    def setUp(self):
        App.stop_debuging()
        # Re-bind the class-level frames on the instance before every test.
        shared = TypeDetectorTests
        self.database_NYPD_MVC = shared.database_NYPD_MVC
        self.database_311 = shared.database_311
        self.database_2j7x_tvss = shared.database_2j7x_tvss
        self.database_26ze_s5bx = shared.database_26ze_s5bx
예제 #6
0
	def test_regressions_DB_2bh6_qmgg_Mean_Scale_Score_has_no_zip_codes(self):
		# Regression check on the 'Mean Scale Score' column of dataset 2bh6-qmgg.
		# NOTE(review): the name says "no zip codes" while the assertion expects
		# 54 entries in element 1 of detect_zip's result - confirm what that
		# element holds.
		csv_path = ResourceUtils.get_test_resource_path('2bh6-qmgg')
		scores = pandas.read_csv(csv_path)['Mean Scale Score']
		zip_detection = TypeDetector.detect_zip(scores)
		self.assertEqual(54, len(zip_detection[1]))
예제 #7
0
	def test_regressions_DB_2bh6_qmgg_Mean_Scale_Score_is_Geo_Zip_but_should_be_Numeric_Integer(self):
		# Pins the type currently detected for 'Mean Scale Score' (GEO_ZIP); the
		# test name records that a numeric integer is the desired outcome.
		csv_path = ResourceUtils.get_test_resource_path('2bh6-qmgg')
		scores = pandas.read_csv(csv_path)['Mean Scale Score']
		type_summary = TypeDetector.types_of(scores)
		top_type = TypeDetector.most_detected(type_summary)[0]
		self.assertEqual(TypeDetector.GEO_ZIP, top_type)
예제 #8
0
# To change this license header, choose License Headers in Project Properties.
# To change this template file, choose Tools | Templates
# and open the template in the editor.

import unittest
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler.Profiler import Profiler
from urban_profiler import ApplicationOptions
from mock import MagicMock

# Test-resource path of the h9gi-nx95 sample, requested without a file extension.
DATABASE_h9gi_nx95_SAMPLE_NO_EXTENSION_PATH = ResourceUtils.get_test_resource_path(
    "h9gi-nx95_SAMPLE")
############################################
## NYPD Motor Vehicle Collisions is h9gi-nx95
############################################


class Profiler_TestCase_NYPD(unittest.TestCase):

    summary_h9gi_nx95 = None
    summary_h9gi_nx95_SAMPLE = None

    def setUp(self):
        # print '[TEST:', self._testMethodName, ']'
        if Profiler_TestCase_NYPD.summary_h9gi_nx95_SAMPLE is None:
            verbose = False
            ApplicationOptions.OPTIONS = {
                'silent': not verbose,
                'verbose': verbose,
                'stop_on_error': True,
                'show_details': True
예제 #9
0
 def test_get_resource_path_folder_is_right(self):
     # The resolved path must be the package resources folder followed by the
     # requested file name. The whole suffix is checked at once because
     # str.rstrip('aAa') removes any trailing 'a'/'A' characters, not the
     # literal file name, so it would also accept a path that lost the name.
     answer = ResourceUtils.resource_path_of('aAa')
     assert answer.endswith('/urban_profiler/resources/aAa')
import unittest
import pandas

from urban_profiler.utils import ResourceUtils
from urban_profiler.utils import PandasUtils as PandasUtils

# Paths of the fixtures used by the tests below (one CSV, one JSON), each
# resolved through ResourceUtils.get_test_resource_path.
DATABASE_NYPD_MVCS_PATH = ResourceUtils.get_test_resource_path(
    'h9gi-nx95_SAMPLE.csv')
DATABASE_2nju_4jd4_PATH = ResourceUtils.get_test_resource_path(
    '2nju-4jd4.json')


class PandaUtils_Tests(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Read the NYPD sample once and store the frame on PandaUtils_Tests.
        nypd_frame = pandas.read_csv(DATABASE_NYPD_MVCS_PATH)
        PandaUtils_Tests.database_NYPD_MVC = nypd_frame

    def setUp(self):
        # Re-bind the class-level frame on the instance before every test.
        shared_frame = PandaUtils_Tests.database_NYPD_MVC
        self.database_NYPD_MVC = shared_frame

    # ========================================================================== load_database()
    def test_load_database_with_csv(self):
        # Loading the CSV sample must produce something other than None.
        loaded = PandasUtils.load_database(DATABASE_NYPD_MVCS_PATH)
        assert loaded is not None

    def test_load_json_with_data_and_metadata_and_subcolumns(self):
        # The 2nju-4jd4 JSON fixture is expected to load with 24 columns.
        loaded = PandasUtils.load_database(DATABASE_2nju_4jd4_PATH)
        column_count = len(loaded.columns)
        self.assertEqual(24, column_count)

    def test_load_with_file_not_found_should_raise_error(self):
예제 #11
0
def _load_detectors_from_file(types_file):
    """Parse the detector definitions in *types_file* into detector dicts.

    The file is read as a header-less CSV whose columns are used by position:
    0 and 1 form the detector name, 2 is a regular expression, 3 is a values
    dictionary (inline CSV text, or a resource file name when column 4 is
    truthy), 5 is the accept-nulls setting and 6 the dictionary comparison
    type.

    Returns:
        list: one detector dictionary per row, in file order.
    """
    App.debug(' >>> Loading dynamic types from file: ', types_file)
    types = pandas.read_csv(types_file, header=None, skipinitialspace=True)
    types = types.where(pandas.notnull(types), None)  # Transform NaN into None

    detectors = []
    for i in types.index:
        # .loc is the label-based replacement for the removed .ix accessor;
        # the row is fetched once instead of once per field.
        row = types.loc[i]
        App.debug("")

        # 1. Name
        name = row[0]
        if row[0] != row[1]:
            name += '-' + row[1]
        App.debug("name= ", name)

        # 2. Regex
        regex_list = row[2]
        App.debug("regex= ", regex_list)
        if isinstance(regex_list, str):
            regex_list = re.compile(regex_list)

        # 3 & 4. Prepare values dictionary
        values_dictionary = row[3]
        App.debug("values_dictionary= ", values_dictionary)

        dictionary_is_file = row[4]
        App.debug("dictionary_is_file= ", dictionary_is_file)

        if isinstance(values_dictionary, str):  # is not None or NaN
            if dictionary_is_file:
                # The cell names a resource file holding the CSV text.
                with open(ResourceUtils.resource_path_of(
                        values_dictionary)) as dict_file:
                    values_dictionary = dict_file.read()

            # Parse the CSV text into a set of values.
            reader = csv.reader(values_dictionary.splitlines(),
                                delimiter=',',
                                skipinitialspace=True)
            parsed_values = set()
            for csv_row in reader:
                parsed_values.update(csv_row)
            values_dictionary = parsed_values

        # 5. Accept Nulls?
        accept_nulls = row[5]
        App.debug("accept_nulls= ", accept_nulls)

        # 6. Comparison type
        comparisson_type = row[6]
        App.debug("Dictionary comparisson type= ", comparisson_type)

        detectors.append({
            DETECTOR_NAME: name,
            REGEX_LIST: [regex_list],
            DICTIONARY: values_dictionary,
            ACCEPT_NULLS: accept_nulls,
            DICTIONARY_COMPARISON_TYPE: comparisson_type,
        })

    App.debug('Loaded types:')
    for item in detectors:
        App.debug(item[DETECTOR_NAME])
    return detectors


def data_detectors():
    """Return the detectors to run as a ``(kind, detectors)`` pair.

    The ``types_file`` application option selects the source:

    * when it is the string ``'true'`` (any case), ``TYPES_REFERENCE_FILE``
      is used as the file;
    * when the resulting value is an existing path, the detectors are read
      from that file, cached in the module-level ``LOADED_DETECTORS`` and
      returned tagged ``'Dynamic'``; the file is only parsed while the cache
      is still ``None``;
    * otherwise the built-in detectors are returned tagged
      ``STATIC_DETECTORS``.

    Returns:
        tuple: ``(kind, detectors)`` where ``detectors`` is a list of
        detector dictionaries in the order they must be run.
    """
    global LOADED_DETECTORS

    types_file = App.get_option('types_file', default=None)
    if types_file and types_file.lower() == 'true':
        types_file = TYPES_REFERENCE_FILE

    if types_file and os.path.exists(types_file):
        if LOADED_DETECTORS is None:
            # The cache is assigned only after the whole file parsed, so a
            # failure part-way through cannot leave a truncated list behind
            # for later calls to reuse.
            LOADED_DETECTORS = _load_detectors_from_file(types_file)
        return 'Dynamic', LOADED_DETECTORS

    # Detector must be in desired order to run
    return STATIC_DETECTORS, [
        {
            DETECTOR_NAME: NULL,
            FUNCTION: detect_null
        },
        {
            DETECTOR_NAME: GEO_ZIP,
            FUNCTION: detect_zip
        },
        DETECTOR_SSN,
        DETECTOR_GEO_ZIP_9,
        DETECTOR_GEO_GPS_LAT_OR_LON,
        DETECTOR_GEO_GPS,
        DETECTOR_GEO_BOROUGH,
        DETECTOR_GEO_ADDRESS,
        # {DETECTOR_NAME: GEO_ADDRESS, FUNCTION: detect_us_address},
        DETECTOR_TEMPORAL_DATE,
        DETECTOR_TEMPORAL_TIME,
        DETECTOR_TEMPORAL_DATE_TIME,
        DETECTOR_PHONE,
        DETECTOR_NUMERIC_INT,
        DETECTOR_NUMERIC_DOUBLE,
        {
            DETECTOR_NAME: TEXTUAL,
            FUNCTION: detect_text
        },
    ]
예제 #12
0
import pandas
import re
import numpy
from urban_profiler import ApplicationOptions as App
from types import FunctionType as function
from types import DictType as dict
import operator
from urban_profiler.utils import ResourceUtils
from urban_profiler.utils import TextUtils
from os.path import expanduser
import csv
import os
import usaddress
from urban_profiler import ApplicationConstants as Constants

# Packaged CSV resource with type definitions ('types_to_detect.csv').
TYPES_REFERENCE_FILE = ResourceUtils.resource_path_of('types_to_detect.csv')

DEBUG = True
PERCENTUAL_PRECISION = 3
# Lower-case null spellings - presumably compared against lower-cased cell
# text; confirm against the code that uses this list.
NULL_VALUES = ['nan', 'none', 'n/a', 'null']

# Labels for the two kinds of detector set.
STATIC_DETECTORS = 'Static'
DYNAMIC_DETECTORS = 'Dynamic'

# Types - Prefixes
NUMERIC = 'Numeric'
TEXTUAL = 'Textual'
GEO = 'Geo'
TEMPORAL = 'Temporal'
NULL = 'Null'
# All type prefixes gathered in one list.
TYPE_PREFIXES = [NULL, TEXTUAL, NUMERIC, GEO, TEMPORAL]
예제 #13
0
 def test_profile_real_database_with_socrata_metadata_vz8c_29aj_csv(self):
     # Profiling the vz8c-29aj CSV must leave 'OK' as the status of the first
     # summary row.
     csv_path = ResourceUtils.get_test_resource_path('vz8c-29aj.csv')
     self.profiler.profile(csv_path)
     first_row = self.profiler.last_sumary.ix[0]
     self.assertEqual('OK', first_row['ETL-Profiler Status'])
예제 #14
0
 def test_get_test_resource_path_folder_has_test_path(self):
     # A test-resource path must contain TEST_PATH. The former 'folder' local
     # (built with str.rstrip, which strips characters rather than a suffix)
     # was never read, so it is gone.
     answer = ResourceUtils.get_test_resource_path('aAa')
     assert TEST_PATH in answer
예제 #15
0
# To change this license header, choose License Headers in Project Properties.
# To change this template file, choose Tools | Templates
# and open the template in the editor.
import unittest

import sys
sys.path.append("/urban_profiler/plot")

from urban_profiler.utils import ResourceUtils
from urban_profiler import Main
from urban_profiler.utils import CLI as CLI
from urban_profiler import ApplicationOptions

# Test-resource paths of the two dataset-list CSV files used by this module.
LIST_DATABASES_FAST_PATH = ResourceUtils.get_test_resource_path(
    'open_data_test_list_fast.csv')
LIST_DATABASES_FAST_WITH_ERROR_PATH = ResourceUtils.get_test_resource_path(
    'open_data_test_list_fast_with_Error.csv')


class Profile_Multiple_Datasets(unittest.TestCase):
    """Test case for profiling several datasets in one run."""

    @classmethod
    def setUpClass(cls):
        # Options installed for the whole class: 'silent' and 'stop_on_error'
        # are both switched on.
        class_options = {'silent': True, 'stop_on_error': True}
        ApplicationOptions.OPTIONS = class_options

    # TODO: refactor those tests and uncomment them.
    # def test_profile_dataset_list_with_error(self):
    #     CLI.ARGS = ['--silent', '--stop_on_error', '--to_folder=/tmp', '--file=a']
    #     self.assertRaises(Exception, Main.main)
예제 #16
0
 def test_get_resource_path_file_is_right(self):
     # The resolved resource path must end with the requested file name.
     requested_name = 'aAa'
     resolved = ResourceUtils.resource_path_of(requested_name)
     assert resolved.endswith('aAa')
예제 #17
0
import unittest
import pandas

from urban_profiler import ApplicationOptions as App
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector

# Test-resource path of the h9gi-nx95 sample CSV read in setUpClass.
DATABASE_h9gi_nx95_PATH = ResourceUtils.get_test_resource_path('h9gi-nx95_SAMPLE.csv')


class TypeDetectorTests(unittest.TestCase):
	"""Type-detector tests that run against the h9gi-nx95 sample CSV."""

	@classmethod
	def setUpClass(cls):
		# Read the sample a single time and keep the frame on the class.
		sample_frame = pandas.read_csv(DATABASE_h9gi_nx95_PATH)
		cls.database_h9gi_nx95 = sample_frame

	def setUp(self):
		App.stop_debuging()
		# Re-bind the class-level frame on the instance before every test.
		self.database_h9gi_nx95 = TypeDetectorTests.database_h9gi_nx95

		# Every tracked type starts each test with a count of zero.
		tracked_types = [
			TypeDetector.GEO_GPS,
			TypeDetector.GEO_ZIP,
			TypeDetector.GEO_BOROUGH,
			TypeDetector.TEMPORAL_DATE,
			TypeDetector.TEMPORAL_TIME,
			TypeDetector.TEMPORAL_DATE_TIME,
			TypeDetector.NUMERIC_INT,
			TypeDetector.NUMERIC_DOUBLE,
			TypeDetector.TEXTUAL,
			TypeDetector.NULL,
		]
		self.type_counts = dict.fromkeys(tracked_types, 0)

	# --------------------------------------------------------------------------- Regression Tests
	# def test_regressions_DB_h9gi_nx95_CONTRIBUTING_FACTOR_VEHICLE_1_IS_TEXT(self):
	# 	col_data = self.database_h9gi_nx95['CONTRIBUTING FACTOR VEHICLE 1']
	# 	detected_type = TypeDetector.type_of_column_data(col_data)
예제 #18
0
 def test_get_resource_path_folder_dont_have_test_path(self):
     # A regular (non-test) resource path must not contain TEST_PATH. The
     # former 'folder' local (built with str.rstrip, which strips characters
     # rather than a suffix) was never read, so it is gone.
     answer = ResourceUtils.resource_path_of('aAa')
     assert TEST_PATH not in answer
예제 #19
0
import pandas
from urban_profiler.utils import ResourceUtils
from urban_profiler.profiler import TypeDetector
from urban_profiler import ApplicationConstants as Constants

# Columns kept from the NYC PLUTO CSV when the geo index is loaded.
GEO_INDEX_COLUMNS = ['address', 'lat', 'lon', 'zipcode', 'borough']
NY_GEO_DATA_FILE = ResourceUtils.resource_path_of('nyc_pluto_prepared.csv')
# bounding boxes of each zipcode according to NYC PLUTO
# (read at import time, with every column kept as a string)
ZIPCODES_LAT_LON_MIN_MAX = pandas.read_csv(
    ResourceUtils.resource_path_of('zipcode_lat_lon.csv'), dtype=str)

# Column lists, one per kind of input data (zipcode, address, GPS).
ZIPCODE_IMPROVEMENTS_WITH_IT = ['zipcode', 'borough']
ADDRESS_IMPROVEMENTS_WITH_IT = ['lat', 'lon', 'address', 'zipcode', 'borough']
GPS_IMPROVEMENTS_WITH_IT = ['lat', 'lon', 'address', 'zipcode', 'borough']

# Cache for the geo index; stays None until get_ny_geo_data() fills it.
NY_GEO_DATA = None


def get_ny_geo_data():
    """Return the NYC PLUTO geo index, loading it on first use.

    While the module-level ``NY_GEO_DATA`` is ``None``, the CSV at
    ``NY_GEO_DATA_FILE`` is read, reduced to ``GEO_INDEX_COLUMNS`` and stored
    in ``NY_GEO_DATA``; later calls return that cached object.

    Returns:
        The cached data restricted to ``GEO_INDEX_COLUMNS``.
    """
    global NY_GEO_DATA
    if NY_GEO_DATA is None:
        # A parenthesised single argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('(Loading NYC Pluto Index...)')
        NY_GEO_DATA = pandas.read_csv(NY_GEO_DATA_FILE)[GEO_INDEX_COLUMNS]
    return NY_GEO_DATA


def improve_zipcode_data(zipcode_data):
    # 1. get NY data higher than zipcode
    nyc_data = get_ny_geo_data()[ZIPCODE_IMPROVEMENTS_WITH_IT]
    nyc_data = nyc_data[pandas.notnull(nyc_data.zipcode)]