def main(): print "===> ETL - Profiler <===" # ---------------------> Initialize batch processing begin_time = TimeUtils.current_time_formated() global opts opts = process_args() # print '1>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS ApplicationOptions.OPTIONS = opts # print '2>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS verb = ApplicationOptions.get_option('verbose', default=False) print '[verbose]=', verb # print '3>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS database_files = [] if opts['sources_in_metadata']: print 'Processing metadata csv to get source files.' database_files = pandas.read_csv(opts['metadata_file'], header=0, index_col=None) if 'source_file' not in database_files.columns: message = 'metadata file should have a source_file column when "sources_in_metadata" option is used.' ApplicationOptions.error(message) raise Exception(message) database_files = database_files['source_file'].tolist() elif opts['cusp_dw']: print 'Generating file list based on CUSP DW folder.' database_files = CuspDW.tabular_ids_in_DW() opts['db_ids_file'] = True print 'Found ', len(database_files), ' ids.' filename = CuspDW.OUTPUT_FILE_NAME if opts['to_folder'] is not None: filename = opts['to_folder'] + filename CuspDW.save_to_id_file(database_files, filename) print 'Saved file with updated ids: ', filename # ApplicationOptions # TODO: why is ApplicationOptions here? return elif opts['file_refs'] is None and opts['db_ids_file'] is None: database_files = [opts['file']] elif opts['db_ids_file'] is not None: print 'Generation file list based on: db_ids_file.' 
for id in open(opts['db_ids_file']).readlines(): database_files.append(id.strip()) else: print 'Generation file list based on: file_refs' lines = open(opts['file_refs']).readlines() for file in lines: if file.strip() == '' or file.startswith('#'): continue database_files.append(file.rstrip()) print 'Databases to profile: {0}'.format(len(database_files)) profile_databases(database_files) # ---------------------> Process Profile data process_profiles() show_summaries() end_time = TimeUtils.current_time_formated() # ---------------------> Save Data save_files() print '\nBegin: ', begin_time print 'End: ', end_time print '\n\n==> The End. <=='
def __init__(self, opts=None):
    """Read the profiler's configuration flags from the application options.

    opts: optional options mapping forwarded to App.get_option; when None,
    the globally registered application options are consulted instead.
    """
    def opt(name, default):
        # Single lookup point for configuration flags.
        return App.get_option(name, default=default, options=opts)

    self.verbose = opt('verbose', False)
    # Details are only shown when verbose output is enabled too.
    self.show_details = opt('show_details', False) and self.verbose
    self.save_output_files = opt('save_details', False)
    self.show_all_columns = opt('show_all_info', False)
    self.silent = opt('silent', False)
    self.to_folder = opt('to_folder', None)
    self.stop_on_error = opt('stop_on_error', False)
    self.plot = opt('plot', False)
    self.skip_views = opt('skip_views', False)
    self.metadata_file = opt('metadata_file', None)
    self.part = opt('part', False)
    self.ignore_index = opt('ignore_index', False)
    self.ignore_metadata = opt('ignore_metadata', False)

    # Normalize the output folder: '' means "current directory".
    if self.to_folder is None:
        self.to_folder = ''

    # Caches for the most recently profiled zip/gps rows.
    self.last_zip_rows = None
    self.last_gps_rows = None

    # Warn when the type detectors did not come from the dynamic types file.
    detector_kind = TypeDetector.data_detectors()[0]
    if detector_kind != 'Dynamic':
        App.warn(
            'Type Detectors Not Loaded Dynamically from File. ({0} Detectors) '
            .format(detector_kind))
def data_detectors():
    """Return the (kind, detectors) pair used to classify column data.

    When the 'types_file' option names an existing CSV file (or is the
    string 'true', meaning "use the bundled reference file"), the detectors
    are loaded from that file once, cached in the module-level global
    LOADED_DETECTORS, and returned with kind 'Dynamic'.  Otherwise a
    hard-coded, ordered detector list is returned with kind STATIC_DETECTORS.

    Expected CSV columns per row (no header):
      0: primary name             1: secondary name (appended when different)
      2: regex pattern (optional) 3: dictionary values, or a dictionary file
      4: dictionary-is-file flag  5: accept-nulls flag
      6: dictionary comparison type
    """
    types_file = App.get_option('types_file', default=None)
    if types_file and types_file.lower() == 'true':
        types_file = TYPES_REFERENCE_FILE
    if types_file and os.path.exists(types_file):
        global LOADED_DETECTORS
        if LOADED_DETECTORS is None:
            App.debug(' >>> Loading dynamic types from file: ', types_file)
            types = pandas.read_csv(types_file, header=None, skipinitialspace=True)
            # Turn pandas NaN placeholders into None for the tests below.
            types = types.where((pandas.notnull(types)), None)
            LOADED_DETECTORS = []
            for i in types.index:
                App.debug("")
                row = types.ix[i]  # hoisted: one row lookup per iteration
                # 1. Name: combine both name columns when they differ.
                name = row[0]
                if row[0] != row[1]:
                    name += '-' + row[1]
                App.debug("name= ", name)
                # 2. Regex (optional; None when the cell was empty).
                regex_list = row[2]
                App.debug("regex= ", regex_list)
                if isinstance(regex_list, str):
                    regex_list = re.compile(row[2])
                # 3 & 4. Values dictionary: inline CSV text, or the name of
                # a resource file containing the CSV text.
                values_dictionary = row[3]
                App.debug("values_dictionary= ", values_dictionary)
                dictionary_is_file = row[4]
                App.debug("dictionary_is_file= ", dictionary_is_file)
                if isinstance(values_dictionary, str):  # present (not None)
                    if dictionary_is_file:
                        with open(ResourceUtils.resource_path_of(
                                values_dictionary)) as dict_file:
                            values_dictionary = dict_file.read()
                    # Parse the CSV text into a set of accepted values.
                    reader = csv.reader(values_dictionary.splitlines(),
                                        delimiter=',', skipinitialspace=True)
                    values = []
                    for csv_row in reader:
                        values.extend(csv_row)
                    values_dictionary = set(values)
                # 5. Accept nulls?
                accept_nulls = row[5]
                App.debug("accept_nulls= ", accept_nulls)
                # 6. Comparison type.
                comparison_type = row[6]
                App.debug("Dictionary comparisson type= ", comparison_type)
                LOADED_DETECTORS.append({
                    DETECTOR_NAME: name,
                    REGEX_LIST: [regex_list],
                    DICTIONARY: values_dictionary,
                    ACCEPT_NULLS: accept_nulls,
                    DICTIONARY_COMPARISON_TYPE: comparison_type,
                })
        App.debug('Loaded types:')
        for item in LOADED_DETECTORS:
            App.debug(item[DETECTOR_NAME])
        return 'Dynamic', LOADED_DETECTORS
    else:
        # Detectors must stay in this order: earlier, more specific
        # detectors get the first chance at classifying a column.
        return STATIC_DETECTORS, [
            {DETECTOR_NAME: NULL, FUNCTION: detect_null},
            {DETECTOR_NAME: GEO_ZIP, FUNCTION: detect_zip},
            DETECTOR_SSN,
            DETECTOR_GEO_ZIP_9,
            DETECTOR_GEO_GPS_LAT_OR_LON,
            DETECTOR_GEO_GPS,
            DETECTOR_GEO_BOROUGH,
            DETECTOR_GEO_ADDRESS,
            DETECTOR_TEMPORAL_DATE,
            DETECTOR_TEMPORAL_TIME,
            DETECTOR_TEMPORAL_DATE_TIME,
            DETECTOR_PHONE,
            DETECTOR_NUMERIC_INT,
            DETECTOR_NUMERIC_DOUBLE,
            {DETECTOR_NAME: TEXTUAL, FUNCTION: detect_text},
        ]