Example No. 1
def profile_as_job(database_file):
    profiler = Profiler.Profiler()

    global STOP_RUNNING
    if STOP_RUNNING: return profiler

    try:
        pid = str(os.getpid())
        print 'Begin: [' + pid + ']: ' + database_file
        profiler.profile(database_file)
        print 'End: [' + pid + ']: ' + database_file

    except KeyboardInterrupt:
        App.error('KeyboardInterrupt with: ' + database_file)
        STOP_RUNNING = ApplicationOptions.OPTIONS['stop_on_error']

    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        for line in traceback.format_exc().split('\n'):
            msg += '[' + pid + '] ' + line + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        # Printing in color here (instead of App.error) makes the error output easier to read when debugging:
        # print tc.RED + msg + tc.ENDC
        ApplicationOptions.error(msg)
        # raise
    finally:
        return profiler
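The per-PID log prefix above suggests this function runs as a worker job. A minimal driver sketch, assuming a standard multiprocessing pool (the file paths are illustrative, not from the original project):

# Hypothetical driver for profile_as_job; paths are illustrative.
from multiprocessing import Pool

database_files = ['data/db_one.csv', 'data/db_two.csv']
pool = Pool(4)  # four worker processes
profilers = pool.map(profile_as_job, database_files)
pool.close()
pool.join()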
Example No. 2
def save_complete_dataset(row,
                          count,
                          summaries,
                          types,
                          column_metadata,
                          gps_counts,
                          report_progress=False):
    pid = str(os.getpid())
    try:
        db = save_database(row, count, summaries)
        save_columns(db, types, column_metadata)
        if not IGNORE_INDEX:
            if COPY_GEO_INDEX:
                # link_gps_data(db.database_id, db.id)
                print '         -> GPS Data will be linked to datasets at the end of processing.'
            else:
                save_gps_data(db, gps_counts)

        # return db
    except:
        msg = '[' + pid + '] ERROR in THREAD:\n'
        msg += '[' + pid + '] -----------------------------------------------------------------\n'
        msg += '[' + pid + '] ' + traceback.format_exc() + '\n'
        msg += '[' + pid + '] -----------------------------------------------------------------'
        ApplicationOptions.error(msg)
        raise
    if report_progress: report(db)
Example No. 3
def run_bash(bash_command, folder=None):
    cmd = bash_command
    if isinstance(cmd, str): cmd = cmd.split()

    if folder is None:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    else:
        # Run inside the given folder and fold stderr into stdout.
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=folder, stderr=subprocess.STDOUT)

    stdout_data, stderr_data = process.communicate()
    if process.returncode != 0:
        message = "%r failed, status code %s stdout %r stderr %r" % (
                       cmd, process.returncode, stdout_data, stderr_data)
        App.error(message)
        raise RuntimeError(message)
    output = ''
    if stdout_data: output += stdout_data
    if stderr_data: output += stderr_data
    return output
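A quick usage sketch (the command and folder are illustrative values, not from the project):

# Run a command and capture its combined output; a non-zero exit
# status raises RuntimeError. 'ls -la' and '/tmp' are illustrative.
output = run_bash('ls -la', folder='/tmp')
print output

# Pass a list to keep arguments with spaces intact (str.split would break them).
print run_bash(['echo', 'hello world'])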
Example No. 4
    def profile(self, file_name, skip_rows=0, n_rows=None):
        # print '-----------------> file_name:', file_name
        file_rows = None
        metadata = None
        self.db_name = None
        try:
            metadata, file_rows = ProfilerUtils.init_profiler(
                file_name,
                part=self.part,
                metadata_file=self.metadata_file,
                ignore_metadata=self.ignore_metadata)
            self.db_name = metadata['db_name']
            # This is a temporary fix for any uncaught error that can occur. TODO: improve this.
            self.last_sumary = self.create_error_sumary(
                'Something went wrong!', file_rows, metadata=metadata)

            self.printv('File rows: {0:n}'.format(file_rows - 1))

            if file_rows == 0:
                self.last_sumary = self.create_error_sumary(
                    'Error: Empty file.', None)
                return

            self.printv('Getting metadata.')
            # metadata = SocrataUtils.metadata_of(self.db_name)

            if self.check_if_skip_dataset(metadata):
                App.warn(
                    'Skipping database analysis: not a primary Socrata database')
                self.last_sumary = self.create_error_sumary(
                    Profiler.MSG_SKIP_VIEW, file_rows, metadata)

            else:
                self.printv("Loading file: " + file_name)
                database = PandasUtils.load_database(file_name,
                                                     skiprows=skip_rows,
                                                     nrows=n_rows)

                self.do_profile(database, file_name, skip_rows, n_rows,
                                metadata)

        except (KeyboardInterrupt, SystemError) as e:
            ex_type, ex, tb = sys.exc_info()
            error_msg = '\n'.join(traceback.format_tb(tb))
            self.last_sumary = self.create_error_sumary(
                'Interrupted by Keyboard', file_rows, metadata=metadata)
            # logging.exception(e)
            App.error('Interrupted by Keyboard \n' + error_msg)
            raise

        except Exception as e:
            ex_type, ex, tb = sys.exc_info()
            error_msg = '\n'.join(traceback.format_tb(tb))
            App.error(error_msg)
            self.last_sumary = self.create_error_sumary('Error: ' + error_msg,
                                                        file_rows,
                                                        metadata=metadata)
            if self.stop_on_error: raise

        except:
            self.last_sumary = self.create_error_sumary(
                'Unknown Error: {0}'.format(sys.exc_info()),
                file_rows,
                metadata=metadata)
            # logging.exception(sys.exc_info())
            App.error(sys.exc_info())
            if self.stop_on_error: raise
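A minimal sketch of driving the profiler directly, assuming a Profiler instance configured as in Example No. 1 (the file name and row limit are illustrative):

# Hypothetical direct call; 'data/sample.csv' is an illustrative path.
profiler = Profiler.Profiler()
profiler.profile('data/sample.csv', skip_rows=0, n_rows=1000)
print profiler.last_sumary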
Example No. 5
def main():
    print "===> ETL - Profiler <==="

    # ---------------------> Initialize batch processing
    begin_time = TimeUtils.current_time_formated()
    global opts
    opts = process_args()

    # print '1>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    ApplicationOptions.OPTIONS = opts
    # print '2>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS
    verb = ApplicationOptions.get_option('verbose', default=False)
    print '[verbose]=', verb
    # print '3>>>>>>>>>>>>>>>>>>>> ', ApplicationOptions.OPTIONS

    database_files = []

    if opts['sources_in_metadata']:
        print 'Processing metadata csv to get source files.'
        database_files = pandas.read_csv(opts['metadata_file'],
                                         header=0,
                                         index_col=None)
        if 'source_file' not in database_files.columns:
            message = 'The metadata file must have a source_file column when the "sources_in_metadata" option is used.'
            ApplicationOptions.error(message)
            raise Exception(message)

        database_files = database_files['source_file'].tolist()

    elif opts['cusp_dw']:
        print 'Generating file list based on CUSP DW folder.'
        database_files = CuspDW.tabular_ids_in_DW()
        opts['db_ids_file'] = True
        print 'Found ', len(database_files), ' ids.'
        filename = CuspDW.OUTPUT_FILE_NAME
        if opts['to_folder'] is not None:
            filename = opts['to_folder'] + filename
        CuspDW.save_to_id_file(database_files, filename)
        print 'Saved file with updated ids: ', filename
        return

    elif opts['file_refs'] is None and opts['db_ids_file'] is None:
        database_files = [opts['file']]

    elif opts['db_ids_file'] is not None:
        print 'Generating file list based on: db_ids_file.'
        for db_id in open(opts['db_ids_file']).readlines():
            database_files.append(db_id.strip())

    else:
        print 'Generating file list based on: file_refs'
        lines = open(opts['file_refs']).readlines()

        # Skip blank lines and commented-out entries.
        for line in lines:
            if line.strip() == '' or line.startswith('#'): continue
            database_files.append(line.rstrip())

        print 'Databases to profile: {0}'.format(len(database_files))

    profile_databases(database_files)

    # ---------------------> Process Profile data
    process_profiles()
    show_summaries()

    end_time = TimeUtils.current_time_formated()

    # ---------------------> Save Data
    save_files()

    print '\nBegin: ', begin_time
    print 'End: ', end_time

    print '\n\n==> The End. <=='
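Based on the parsing loop above, a file_refs input is a plain-text file with one dataset path per line; blank lines and lines starting with '#' are skipped. An illustrative example:

# file_refs example (paths are illustrative)
data/dataset_one.csv
data/dataset_two.csv

# temporarily disabled:
# data/dataset_three.csv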
Example No. 6
def process_profiles():
    try:
        global summaries
        global all_gps_rows
        global geo_index_by_dataset
        global all_gps_rows_by_database
        global all_zip_rows
        global gps_db_count
        global zip_db_count
        global profiled_database_types
        profiled_database_types = None
        global profiled_column_metadata
        profiled_column_metadata = None
        summaries = pandas.DataFrame()
        # Count of rows by ZIP and GPS
        all_gps_rows = pandas.DataFrame()
        all_zip_rows = pandas.DataFrame()
        geo_index_by_dataset = pandas.DataFrame()
        all_gps_rows_by_database = pandas.DataFrame()
        # Count of databases that have ZIP and GPS
        gps_db_count = pandas.DataFrame()
        zip_db_count = pandas.DataFrame()

        i = 0
        for profiler in profilers:
            i += 1
            summaries = pandas.concat([summaries, profiler.last_sumary])

            if profiler.last_sumary['ETL-Profiler Status'][0] == 'OK':
                print 'Counting zip and gps data: ', i, '/', len(profilers)
                print '    by Rows'
                all_zip_rows = PandasUtils.merge_series_summing_values(
                    all_zip_rows, profiler.last_zip_rows)
                all_gps_rows = PandasUtils.merge_series_summing_values(
                    all_gps_rows, profiler.last_gps_rows)

                print '    rows by Databases'
                all_gps_rows_by_database = PandasUtils.merge_by_database(
                    all_gps_rows_by_database, profiler.last_gps_rows,
                    profiler.last_sumary.ix[0].Name)
                if not opts['ignore_index']:
                    geo_index_by_dataset = geo_index_by_dataset.append(
                        profiler.last_geo_index, ignore_index=True)

                print '    by Databases'
                # Count each database only once, even if the value appears in many rows.
                temp = pandas.DataFrame(profiler.last_zip_rows,
                                        columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                zip_db_count = PandasUtils.merge_series_summing_values(
                    zip_db_count, temp)

                temp = pandas.DataFrame(profiler.last_gps_rows,
                                        columns=['count'])
                temp['count'] = 1
                temp = temp['count']
                gps_db_count = PandasUtils.merge_series_summing_values(
                    gps_db_count, temp)

                if profiled_database_types is None:
                    profiled_database_types = profiler.types_summary.copy()
                else:
                    profiled_database_types = profiled_database_types.append(
                        profiler.types_summary, ignore_index=True)

                if profiled_column_metadata is None:
                    profiled_column_metadata = profiler.column_metadata.copy()
                else:
                    profiled_column_metadata = profiled_column_metadata.append(
                        profiler.column_metadata, ignore_index=True)
    except:
        if opts['stop_on_error']:
            raise
        ApplicationOptions.error(Exception('Error processing profilers'))

    if 'stop_on_error' in opts and opts['stop_on_error'] and has_error_on(
            summaries):
        ApplicationOptions.error(Exception('Error on summaries'))
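PandasUtils.merge_series_summing_values is not shown in these examples. A minimal sketch of what it plausibly does, assuming both arguments are pandas Series of counts (with an empty DataFrame on the first call, as initialized above); the real helper may differ:

import pandas

def merge_series_summing_values(accumulated, new_values):
    # Sketch only: align on the index and sum, treating missing keys as zero.
    if len(accumulated) == 0:
        return new_values.copy()
    return accumulated.add(new_values, fill_value=0)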
Example No. 7
def run(*args):
    begin = current_time_formated()

    # if len(args) != 1: raise Exception('Summaries file must be passed as param. Only this param should be used.')

    print '---------- Considered Configuration ------------'
    print 'Args = ', args
    # IGNORE_INDEX and COPY_GEO_INDEX are read as module globals elsewhere
    # (e.g. in save_complete_dataset), so declare them global before assigning.
    global COPY_GEO_INDEX
    global IGNORE_INDEX
    global IMPORT_ONLY
    CLEAN = args[0] == 'clean' or (len(args) > 1 and args[1] == 'clean')
    for arg in args:
        if arg.startswith("import-only="): IMPORT_ONLY = arg.split('=')[1]
    IGNORE_INDEX = 'ignore-index' in args
    REFRESH_VIEWS = 'skip-refresh-views' not in args
    COPY_GEO_INDEX_ONLY = 'copy-index-only' in args
    COPY_GEO_INDEX = 'copy-geo-index' in args or COPY_GEO_INDEX_ONLY

    print 'CLEAN = ', CLEAN
    print 'IGNORE_INDEX = ', IGNORE_INDEX
    print 'COPY_GEO_INDEX = ', COPY_GEO_INDEX
    print 'COPY_GEO_INDEX_ONLY = ', COPY_GEO_INDEX_ONLY
    print 'REFRESH_VIEWS = ', REFRESH_VIEWS
    print 'IMPORT_ONLY dataset = ', IMPORT_ONLY
    print '----------------------\n\n'

    # link_missing_gpsdata()
    # stop=1/0

    if CLEAN:
        print 'Cleaning database before import.'
        ColumnData.objects.all().delete()
        Column.objects.all().delete()
        GpsData.objects.all().delete()
        Database.objects.all().delete()

    summaries_file = args[0]
    column_metadata_file = summaries_file.replace('.csv',
                                                  '_columns_metadata.csv')
    if not os.path.isfile(column_metadata_file): column_metadata_file = None
    gps_counts_file = summaries_file.replace('.csv', '_index.csv')
    if not os.path.isfile(gps_counts_file): gps_counts_file = None

    if args[0] not in ['update_alarms_only'] and not COPY_GEO_INDEX_ONLY:
        ####################### Open files: summaries, columns, geo_index
        print 'Loading summaries...'
        summaries = pandas.read_csv(summaries_file, na_values="", dtype=object)

        print 'Loading columns...'
        types_file = summaries_file.replace('.csv', '_columns.csv')
        types = pandas.read_csv(types_file)
        print '     > ', len(types)

        print 'Loading column data...'
        column_metadata = pandas.DataFrame()
        if column_metadata_file:
            column_metadata = pandas.read_csv(column_metadata_file)
        print '     > ', len(column_metadata)

        gps_counts = None
        if IGNORE_INDEX:
            print 'IGNORING Geo Index...'
        else:
            if COPY_GEO_INDEX:
                print 'Geo Index will be COPIED and Linked after insert datasets ...'
            else:
                print 'Loading Geo Index...'
                if gps_counts_file:
                    gps_counts = pandas.read_csv(gps_counts_file)
                else:
                    gps_counts = pandas.DataFrame()

        print 'Summaries to import: ', len(summaries), '\n\n'
        columns = summaries.columns

        # pool = Pool(WORKERS)
        # print 'Created pool with size:', WORKERS

        global total_databases
        total_databases = len(summaries)
        for count, row in summaries.iterrows():
            if IMPORT_ONLY and row.Name != IMPORT_ONLY: continue
            # if count < 903: continue
            # if row.Name != 'hh8s-wfy6': continue
            try:
                save_complete_dataset(row,
                                      count + 1,
                                      summaries,
                                      types,
                                      column_metadata,
                                      gps_counts,
                                      report_progress=True)
                # print '   + Creating job for ', row.Name
                # pool.apply_async(save_complete_dataset,
                # 	args=(row, count, summaries, types, column_metadata, gps_counts),
                # 	callback= report)

            except Exception as e:
                ApplicationOptions.error(e)
                if STOP_ON_ERROR:
                    raise
                else:
                    ex_type, ex, tb = sys.exc_info()
                    ERRORS.append('{0}: {1}\n{2}\n'.format(
                        row.Name, e, traceback.format_tb(tb)))
            # break

        # print 'All jobs created. Waiting for results...'
        # pool.close()
        # pool.join()
        # print 'All profilers done.'

        print 'All imported !!!'
        print '=============================='
        print '    Datasets = ', len(summaries)
        print '    Columns = ', len(types)
        print '    Column Data = ', len(column_metadata)
        # print '    GPS Data = ', len(gps_counts)
        print '=============================='

    if COPY_GEO_INDEX:
        print 'Copying GEO INDEX. (This can take a few minutes.)'
        copy_from_geo_index(gps_counts_file, summaries_file=summaries_file)

    print 'Updating alarms. (This can take some time.)'
    update_alarms()
    if REFRESH_VIEWS and gps_counts_file is not None:
        print 'Refreshing materialized views. (This can take some time.)'
        refresh_materialized_views()
    else:
        print 'Materialized views were not refreshed.'

    print 'Updating System Control...'
    system = System()
    if args[0] == 'update_alarms_only':
        system.source_file = 'Updated Index Only'
    elif args[0] == 'clean':
        system.source_file = 'Clean DB only'
    else:
        system.source_file = summaries_file
    system.save()

    print 'Clearing cache so it will be updated on demand.'
    # clear_cache()
    print 'Clear the cache to update Urban Profiler.'

    print ' -----------------------------------< ERRORS >------------------------------------------'
    for e in ERRORS:
        ApplicationOptions.error(e)
    print ' ------------------------------------< END >------------------------------------------'

    end = current_time_formated()

    print '\nBegin:', begin
    print 'End:  ', end

    print '!!! END OF SCRIPT !!!'
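A sketch of how run might be invoked, using the flags parsed at the top of the function (the summaries file name is illustrative):

# Full import from an illustrative summaries file.
run('output/summaries.csv')

# Wipe the database first, then import.
run('output/summaries.csv', 'clean')

# Copy and link only the geo index, skipping the dataset import.
run('output/summaries.csv', 'copy-index-only')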