예제 #1
0
def main():
    print "===> ETL - Profiler <==="

    # ---------------------> Initialize batch processing
    begin_time = TimeUtils.current_time_formated()
    global opts
    opts = process_args()

    # print '1>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    ApplicationOptions.OPTIONS = opts
    # print '2>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    verb = ApplicationOptions.get_option('verbose', default=False)
    print '[verbose]=', verb
    # print '3>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS

    database_files = []

    if opts['sources_in_metadata']:
        print 'Processing metadata csv to get source files.'
        database_files = pandas.read_csv(opts['metadata_file'],
                                         header=0,
                                         index_col=None)
        if 'source_file' not in database_files.columns:
            message = 'metadata file should have a source_file column when "sources_in_metadata" option is used.'
            ApplicationOptions.error(message)
            raise Exception(message)

        database_files = database_files['source_file'].tolist()

    elif opts['cusp_dw']:
        print 'Generating file list based on CUSP DW folder.'
        database_files = CuspDW.tabular_ids_in_DW()
        opts['db_ids_file'] = True
        print 'Found ', len(database_files), ' ids.'
        filename = CuspDW.OUTPUT_FILE_NAME
        if opts['to_folder'] is not None:
            filename = opts['to_folder'] + filename
        CuspDW.save_to_id_file(database_files, filename)
        print 'Saved file with updated ids: ', filename  # ApplicationOptions # TODO: why is ApplicationOptions here?
        return

    elif opts['file_refs'] is None and opts['db_ids_file'] is None:
        database_files = [opts['file']]

    elif opts['db_ids_file'] is not None:
        print 'Generation file list based on: db_ids_file.'
        for id in open(opts['db_ids_file']).readlines():
            database_files.append(id.strip())

    else:
        print 'Generation file list based on: file_refs'
        lines = open(opts['file_refs']).readlines()

        for file in lines:
            if file.strip() == '' or file.startswith('#'): continue
            database_files.append(file.rstrip())

        print 'Databases to profile: {0}'.format(len(database_files))

    profile_databases(database_files)

    # ---------------------> Process Profile data
    process_profiles()
    show_summaries()

    end_time = TimeUtils.current_time_formated()

    # ---------------------> Save Data
    save_files()

    print '\nBegin: ', begin_time
    print 'End: ', end_time

    print '\n\n==> The End. <=='
예제 #2
0
    def __init__(self, opts=None):
        """Configure the instance from application options.

        Every setting is looked up with ``App.get_option``; ``opts`` is passed
        through unchanged as its ``options`` argument. A warning is emitted
        through ``App.warn`` when ``TypeDetector.data_detectors()`` does not
        report ``'Dynamic'`` detectors.
        """

        def read(name, fallback=False):
            # One lookup path so all settings are fetched the same way.
            return App.get_option(name, default=fallback, options=opts)

        self.verbose = read('verbose')
        # show_details is only kept when verbose is truthy as well.
        details_wanted = read('show_details')
        self.show_details = details_wanted and self.verbose
        self.save_output_files = read('save_details')
        self.show_all_columns = read('show_all_info')
        self.silent = read('silent')
        # A missing output folder is normalised to the empty string.
        target_folder = read('to_folder', None)
        self.to_folder = '' if target_folder is None else target_folder
        self.stop_on_error = read('stop_on_error')
        self.plot = read('plot')
        self.skip_views = read('skip_views')
        self.metadata_file = read('metadata_file', None)
        self.part = read('part')
        self.ignore_index = read('ignore_index')
        self.ignore_metadata = read('ignore_metadata')

        self.last_zip_rows = None
        self.last_gps_rows = None

        detector_kind = TypeDetector.data_detectors()[0]
        if detector_kind != 'Dynamic':
            warning = 'Type Detectors Not Loaded Dynamically from File. ({0} Detectors) '
            App.warn(warning.format(detector_kind))
예제 #3
0
def _load_dynamic_detectors(types_file):
    """Build the detector list described by the CSV file ``types_file``.

    The file is read without a header row and its columns are used by
    position:

    * 0, 1 -- combined into the detector name (``col0-col1`` when they differ)
    * 2    -- regular expression, compiled when it is a string
    * 3    -- values dictionary as CSV text; when column 4 is truthy it is
              instead passed to ``ResourceUtils.resource_path_of`` and the
              CSV text is read from the resulting file
    * 5    -- accept-nulls value
    * 6    -- dictionary comparison type

    Returns:
        list of dicts keyed by DETECTOR_NAME, REGEX_LIST, DICTIONARY,
        ACCEPT_NULLS and DICTIONARY_COMPARISON_TYPE.
    """
    types = pandas.read_csv(types_file, header=None, skipinitialspace=True)
    # Transform NaN into None
    types = types.where(pandas.notnull(types), None)

    detectors = []
    for i in types.index:
        # DataFrame.ix no longer exists in current pandas. With the default
        # integer labels produced by header=None, .loc selects the same row.
        row = types.loc[i]

        App.debug("")
        # 1. Name
        name = row[0]
        if row[0] != row[1]:
            name += '-' + row[1]
        App.debug("name= ", name)

        # 2. Regex
        regex_list = row[2]
        App.debug("regex= ", regex_list)
        if isinstance(regex_list, str):
            regex_list = re.compile(regex_list)

        # 3 & 4. Prepare values dictionary
        values_dictionary = row[3]
        App.debug("values_dictionary= ", values_dictionary)

        dictionary_is_file = row[4]
        App.debug("dictionary_is_file= ", dictionary_is_file)

        if isinstance(values_dictionary, str):  # i.e. not None/NaN
            if dictionary_is_file:
                # The cell names a resource file holding the CSV text.
                dictionary_path = ResourceUtils.resource_path_of(
                    values_dictionary)
                with open(dictionary_path) as dict_file:
                    values_dictionary = dict_file.read()

            # Parse the CSV text into a set of values.
            reader = csv.reader(values_dictionary.splitlines(),
                                delimiter=',',
                                skipinitialspace=True)
            values_dictionary = set()
            for fields in reader:
                values_dictionary.update(fields)

        # 5. Accept Nulls?
        accept_nulls = row[5]
        App.debug("accept_nulls= ", accept_nulls)

        # 6. Comparison type
        comparisson_type = row[6]
        App.debug("Dictionary comparisson type= ", comparisson_type)

        detectors.append({
            DETECTOR_NAME: name,
            REGEX_LIST: [regex_list],
            DICTIONARY: values_dictionary,
            ACCEPT_NULLS: accept_nulls,
            DICTIONARY_COMPARISON_TYPE: comparisson_type,
        })
    return detectors


def data_detectors():
    """Return the kind of type detectors in use and the detector list.

    The ``types_file`` option selects the source. The string ``'true'`` (any
    case) stands for ``TYPES_REFERENCE_FILE``. When the option names an
    existing file, the detectors are loaded from it once and cached in the
    module-level ``LOADED_DETECTORS``; otherwise the built-in list is used.

    Returns:
        tuple: ``('Dynamic', LOADED_DETECTORS)`` for file-based detectors, or
        ``(STATIC_DETECTORS, [...])`` with the built-in detectors in the
        order they must run.
    """
    global LOADED_DETECTORS

    types_file = App.get_option('types_file', default=None)
    if types_file and types_file.lower() == 'true':
        types_file = TYPES_REFERENCE_FILE

    if types_file and os.path.exists(types_file):
        if LOADED_DETECTORS is None:
            App.debug(' >>> Loading dynamic types from file: ', types_file)
            # Publish to the cache only after a complete load, so a failure
            # part-way through cannot leave a truncated list behind.
            LOADED_DETECTORS = _load_dynamic_detectors(types_file)
            App.debug('Loaded types:')
            for item in LOADED_DETECTORS:
                App.debug(item[DETECTOR_NAME])
        return 'Dynamic', LOADED_DETECTORS

    else:
        # Detectors must be in the desired order to run
        return STATIC_DETECTORS, [
            {
                DETECTOR_NAME: NULL,
                FUNCTION: detect_null
            },
            {
                DETECTOR_NAME: GEO_ZIP,
                FUNCTION: detect_zip
            },
            DETECTOR_SSN,
            DETECTOR_GEO_ZIP_9,
            DETECTOR_GEO_GPS_LAT_OR_LON,
            DETECTOR_GEO_GPS,
            DETECTOR_GEO_BOROUGH,
            DETECTOR_GEO_ADDRESS,
            # {DETECTOR_NAME: GEO_ADDRESS, FUNCTION: detect_us_address},
            DETECTOR_TEMPORAL_DATE,
            DETECTOR_TEMPORAL_TIME,
            DETECTOR_TEMPORAL_DATE_TIME,
            DETECTOR_PHONE,
            DETECTOR_NUMERIC_INT,
            DETECTOR_NUMERIC_DOUBLE,
            {
                DETECTOR_NAME: TEXTUAL,
                FUNCTION: detect_text
            },
        ]