Example #1
def get_geo_columns_for_index(column_types, dataset):
    App.debug('Preparing Geo Index')

    geo_cols = {}
    lat_col_name = None
    lon_col_name = None
    for count_cols, row in column_types.iterrows():
        App.debug('row= ', row['profiler-most-detected'], row['column-name'])
        # print "row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO=", row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_GEO:
            col_name = row['column-name']
            App.debug('> Found:', col_name)

            # Handle the case where LATITUDE and LONGITUDE come in separate columns
            if row['profiler-most-detected'] == TypeDetector.GEO_GPS_LATLON:
                if 'LONGITUDE' in col_name.upper():
                    lon_col_name = col_name
                elif 'LATITUDE' in col_name.upper():
                    lat_col_name = col_name
            else:
                geo_cols[col_name] = row['profiler-most-detected']

    if lat_col_name and lon_col_name:
        new_gps_col = NEW_GEO_COL_GPS_PREFIX
        dataset[new_gps_col] = PandasUtils.join_lat_lon_into_gps(dataset, lat_col_name, lon_col_name)
        geo_cols[new_gps_col] = TypeDetector.GEO_GPS
        App.debug('CREATED GPS COL:', dataset[new_gps_col])

    App.debug('Geo cols to index:', geo_cols)
    return geo_cols
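A minimal usage sketch of the column_types frame that the function above iterates over; the literal type strings are hypothetical placeholders, not the real TypeDetector constants:

import pandas

# Hypothetical profiler output: only the two columns read by the loop above matter.
column_types = pandas.DataFrame([
    {'column-name': 'Latitude', 'profiler-most-detected': 'geo-gps-latlon'},   # placeholder type string
    {'column-name': 'Longitude', 'profiler-most-detected': 'geo-gps-latlon'},  # placeholder type string
    {'column-name': 'Zip Code', 'profiler-most-detected': 'geo-zip'},          # placeholder type string
])

for _, row in column_types.iterrows():
    print('{0}: {1}'.format(row['column-name'], row['profiler-most-detected']))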
Example #2
def is_us_address(value):
    try:
        tag = usaddress.tag(value)
        # usaddress.tag() returns (tagged_components, address_type);
        # anything usaddress cannot classify is tagged as 'Ambiguous'.
        return tag[-1] != 'Ambiguous'
    except usaddress.RepeatedLabelError as ex:
        App.debug('Error detecting Geo-Address:', ex)
        return False
Example #3
def detect_null(column):
    App.debug('Detecting: Null')
    not_null_indexes = column.astype(str).apply(
        lambda x: x.lower() not in NULL_VALUES)
    not_null = column[not_null_indexes]
    null_indexes = column[~not_null_indexes]
    App.debug('   detected: ', len(null_indexes))
    # App.debug('   null_indexes: \n', null_indexes)
    # if len(null_indexes) > 0:
    return NULL, null_indexes, not_null
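A self-contained sketch of the null-masking idea used above; the NULL_VALUES set here is a hypothetical stand-in for the module-level constant:

import pandas

NULL_VALUES = {'', 'null', 'none', 'n/a', 'nan'}  # assumed contents, for illustration only

column = pandas.Series(['42', 'N/A', '', '17'])
not_null_mask = column.astype(str).apply(lambda x: x.lower() not in NULL_VALUES)
print(column[not_null_mask].tolist())   # values handed on to the other detectors
print(column[~not_null_mask].tolist())  # values counted as nulls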
Example #4
def simplify(column_types_count):
    simple_types_count = {}
    for prefix in TYPE_PREFIXES:
        simple_types_count[prefix] = 0

    for type in column_types_count:
        App.debug('Computing [{0}]: {1}'.format(type,
                                                column_types_count[type]))
        prefix = type.split('-')[0]
        simple_types_count[prefix] += column_types_count[type]

    App.debug('Simple types count: ', simple_types_count)
    return simple_types_count
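A self-contained sketch of the prefix-collapsing idea behind simplify(); the prefix list and the detailed counts are hypothetical examples:

TYPE_PREFIXES = ['geo', 'temporal', 'numeric', 'textual', 'null']  # assumed prefixes
# Detailed per-type percentages are keyed as '<prefix>' or '<prefix>-<subtype>'.
column_types_count = {'geo-zip': 35.0, 'geo-gps': 5.0, 'numeric-int': 10.0, 'null': 50.0}

simple_types_count = {prefix: 0 for prefix in TYPE_PREFIXES}
for detailed_type, percent in column_types_count.items():
    simple_types_count[detailed_type.split('-')[0]] += percent  # 'geo-zip' -> 'geo'

print(simple_types_count)  # e.g. {'geo': 40.0, 'numeric': 10.0, 'null': 50.0, 'temporal': 0, 'textual': 0}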
Example #5
def tabular_ids_in_DW(not_primary_too=True):
    MULTI_THREAD = App.OPTIONS['threads'] > 1
    ids = os.listdir(BASE_PATH)
    App.debug('Verifying if ', len(ids),
              ' ids are from tabular views: (This may take some time)')
    pool = Pool(App.OPTIONS['threads'])
    index = 0
    for id in ids:
        index += 1
        App.debug(index, '/', len(ids), ' Creating job for or processing id: ',
                  id)
        if MULTI_THREAD:
            pool.apply_async(SocrataUtils.metadata_of,
                             args=(id, ),
                             callback=add_id_to_list)
        else:
            add_id_to_list(SocrataUtils.metadata_of(id))

    if MULTI_THREAD:
        pool.close()
        pool.join()

    App.debug('\n\nFound ', len(tabular_ids), ' tabular ids.')
    App.debug(tabular_ids)
    return tabular_ids
Example #6
def prepare_location_columns(database, metadata_types):
    
    for col in database.columns:
#        print 'Checking:' + col + ' -type:' + metadata_types[col]

        col_is_string = database[col].dtype == object
        if col_is_string and col in metadata_types.keys() and metadata_types[col].lower() == 'location':
            #is it complex ?
            if True in database[col].astype(str).apply(lambda x: '<br />' in x or '\n' in x).value_counts():
                App.debug('Separating location column: ', col)
                #split in multiple columns

                new_col = col + PREFIX_NEW_COLUMN + 'gps'
                database[new_col] = database[col].apply(lambda x: extract_gps_from_composite_location(x))
Example #7
def datetime_from_str_date(string_date):
    try:
        if PandasUtils.is_valid(string_date):
            # print '>>>', string_date, ' type:', type(string_date)
            parsed_date = parser.parse(string_date, fuzzy=True, default=EPOCH)
            return parsed_date
        else:
            return Constants.MISSING_DATA_SYMBOL

    except:
        App.debug(
            'ERROR CONVERTING DATE FROM STRING: TimeUtils.datetime_from_str_date({0})'
            .format(string_date))
        return Constants.MISSING_DATA_SYMBOL
Example #8
def metadata_of(database_name, file_name, metadata_file=None):
    if metadata_file is None:
        metadata = SocrataUtils.metadata_of(database_name)
    else:
        metadata = get_metadata_from_file(database_name, file_name,
                                          metadata_file)
        if metadata_source_is_socrata(metadata[SOURCE]):
            print '   Metadata file references Socrata Portal:', metadata[
                MetadataConstants.SOURCE_URL]
            metadata = SocrataUtils.metadata_of(
                database_name,
                portal_url=metadata[MetadataConstants.SOURCE_URL])

    App.debug(
        json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True))
    return metadata
Example #9
def most_detected(detected_types):
    if detected_types is None or len(detected_types) == 0:
        return NULL, 100  # %

    App.debug('Detected types: ', detected_types.keys())
    most_detected_type = NULL
    precision = detected_types[most_detected_type]

    for key in detected_types.keys():
        current = detected_types[key]
        App.debug('Current: [{0}]={1}'.format(key, current))
        if current > 0 and current >= precision:
            most_detected_type = key
            precision = detected_types[most_detected_type]
            App.debug('most_detected updated with key:', key)

    App.debug('[most_detected] detected_types=', detected_types)
    App.debug('[most_detected]:', most_detected_type)
    return most_detected_type, precision
Example #10
def get_temp_columns_for_index(column_types, dataset):
    App.debug('Preparing Temporal Index')

    temp_cols = {}
    date_col_name = None
    date_col_type = None
    time_col_name = None
    for count_cols, row in column_types.iterrows():
        if row['profiler-most-detected'] in INDEX_ACCEPTED_TYPES_TEMP:
            col_name = row['column-name']
            App.debug('> Found:', col_name)

            if col_name.upper() == 'DATE':
                date_col_name = col_name
                date_col_type = row['profiler-most-detected']
            elif col_name.upper() == 'TIME':
                time_col_name = col_name
            else:
                temp_cols[col_name] = row['profiler-most-detected']

    # print 'date_col_name and time_col_name=', date_col_name, time_col_name
    # If dataset has TIME and DATE join both as one column
    if date_col_name and time_col_name:
        new_col_name = NEW_TEMP_COL_DATETIME_PREFIX
        dataset[new_col_name] = TimeUtils.join_date_and_time(dataset[date_col_name], dataset[time_col_name])
        temp_cols[new_col_name] = TypeDetector.TEMPORAL_DATE_TIME

    # Has only a date column (no time). A time-only column is ignored.
    elif date_col_name:
        temp_cols[date_col_name] = date_col_type

    App.debug('Temp cols to index', temp_cols)
    return temp_cols
Example #11
def types_of(column):
    App.debug('Detecting types of: ', column.name)
    App.debug('    size: ', len(column))
    detectors_type, detectors = data_detectors()
    App.debug('    Initializing detected_types. ')
    detected_types = {}
    # Initialize with all zeros
    for detector in detectors:
        detected_types[detector[DETECTOR_NAME]] = 0.0
    if len(column) == 0:
        App.debug('Empty column!')
        return detected_types

    remaining_values_to_detect_type = column.copy()

    ## If column is in unicode, transform to ASCII to avoid errors during processing.
    ## Check for unicode in column values
    unicode_values = remaining_values_to_detect_type.apply(
        lambda x: (type(x) is unicode))
    unicode_values_counts = unicode_values.value_counts()
    ## Transform the unicode values into ascii if there are any
    if True in unicode_values_counts.keys(
    ) and unicode_values_counts[True] > 0:
        App.info('Recoding values... (this can take some time)')
        remaining_values_to_detect_type = remaining_values_to_detect_type.apply(
            lambda x: TextUtils.reencode_text_if_not_ascii(x))

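    # Run the detectors in order: each detector keeps the values it recognizes,
    # and only the remaining (undetected) values are handed to the next detector.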
    for detector in detectors:
        detected, not_detected, type_name = detect_type(
            detector, detectors_type, remaining_values_to_detect_type)
        detected_types[type_name] = round(
            len(detected) * 100.0 / len(column), PERCENTUAL_PRECISION)
        remaining_values_to_detect_type = not_detected
        App.debug('    Remaining: ', len(not_detected))


#        if len(remaining_values_to_detect_type) == 0:
#            break
    return detected_types
Example #12
def detect_text(column):
    App.debug('Detecting: Text')
    nulls, not_nulls = detect_null(column)[1:]

    App.debug('Text Values:', not_nulls.values[:10])
    App.debug('Non Text Values:', nulls.values[:10])

    return TEXTUAL, not_nulls, nulls
Example #13
def generate_index_on(index_geo_cols, index_temp_cols, dataset, db_name):
    index = pandas.DataFrame(columns=INDEX_COLUMNS)

    # No columns to generate index
    if len(index_geo_cols.keys()) == 0 and len(index_temp_cols.keys()) == 0: return index

    # Prepare the list of cols.
    # If one of them is empty, add a placeholder entry so the nested loop below
    # still runs and calls generate_partial_index_on.
    if index_geo_cols is None or len(index_geo_cols) == 0: index_geo_cols[PHANTON_COL] = None
    if index_temp_cols is None or len(index_temp_cols) == 0: index_temp_cols[PHANTON_COL] = None

    # Clean dataset before create partial index
    print 'Cleaning dataset to process index'
    print 'dataset size:', len(dataset)
    cols_to_clean = index_geo_cols.copy()
    cols_to_clean.update(index_temp_cols)
    for col in cols_to_clean:
        print '     > {0} - {1}'.format(col, cols_to_clean[col]).ljust(50) + '@' + TimeUtils.current_time_formated()
        # If current col is the PHANTON col, skip it
        if col == PHANTON_COL: continue
        clean_invalid_values(cols_to_clean[col], col, dataset)
        print '          dataset size:', len(dataset)

    for geo_col in index_geo_cols.keys():
        geo_type = index_geo_cols[geo_col]

        for temp_col in index_temp_cols.keys():
            temp_type = index_temp_cols[temp_col]

            an_index = generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name)
            App.info('	Adding to index... '.ljust(50) + '@' + TimeUtils.current_time_formated())
            index = index.append(an_index, ignore_index=True)

    App.info('Index created with {0} rows'.format(len(index)))
    App.debug('>>>>> INDEX <<<<<\n', index)

    return index
Example #14
def valid_values_of_type(type_name, column_values):
    App.debug('valid_values_of_type()')
    detectors_type, type_detectors = data_detectors()
    for detector in type_detectors:
        if detector['name'] == type_name:
            App.debug('Detecting valid values for:', type_name)
            detected, not_detected, type_name = detect_type(
                detector, detectors_type, column_values)
            # type_name, detected, not_detected = detect_using_dynamic(detector, column_values)
            App.debug('Detected: ', len(detected))
            return detected
    return None
Example #15
def add_id_to_list(metadata):
    try:
        if metadata[SocrataUtils.STATUS] != SocrataUtils.STATUS_SUCCESS:
            App.debug('Metadata not found.')
            return

        id = metadata['Socrata Id']
        #    App.debug( 'Begin: [' + str(os.getpid()) + ']: ' + id)
        if metadata[SocrataUtils.DISPLAY_TYPE_KEY] == 'table':
            if SocrataUtils.is_primary(
                    metadata) or App.OPTIONS['process-views']:
                App.debug('    ', id, ': True')
                tabular_ids.append(id)
            else:
                App.debug('    ', id, ': True (Skipped - view)')
        else:
            App.debug('    ', id, ': False')


#    App.debug( 'End: [' + str(os.getpid()) + ']: ' + id)
    except:
        print 'Error: ', traceback.format_exc()
Example #16
def detect_us_address(column):
    type = GEO_ADDRESS
    prepared_col_data = column.dropna()

    is_address = prepared_col_data.astype(str).str.upper().apply(
        lambda x: is_us_address(x))
    detected = prepared_col_data[is_address == True]
    not_detected = prepared_col_data[is_address == False]

    App.debug('Detected Type:', type)
    App.debug('Detected Values:', len(detected), ' - ', detected.values[:10])
    App.debug('Non Detected Values:', len(not_detected), ' - ',
              not_detected.values[:10])

    return type, detected, not_detected
Example #17
def metadata_of(database_id, first=True, portal_url=NYC_OPENDATA_URL_BASE):
    App.debug(' SocrataUtils.metadata_of({0})'.format(database_id))
    
    url = portal_url + '/views/' + database_id + JSON_EXTENSION + APP_TOKEN_PARAM
    # App.debug('url: ', url)
    metadata = {'source':'Socrata'}
    # try:
    if True:
        App.debug('Url to get metadata from: ' + url)
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        
        if 'id' in data and data['id'] == database_id:
            App.debug('    -> Success retrieving metadata!')
            App.debug('Retrieved metadata Keys:\n - ' + '\n - '.join(data.keys() ))
            App.debug('Retrieved metadata:\n' + json.dumps(data, indent=4, sort_keys=True))
            App.debug('==========================================')
            
            if 'rowIdentifierColumnId' in data:
                id_column_id = data['rowIdentifierColumnId']
                for col in data['columns']:
                    if col['id'] == id_column_id: 
                        metadata[MetadataConstants.ID_COLUMN] = col['name']
            else:
                metadata[MetadataConstants.ID_COLUMN] = None
                

            metadata[MetadataConstants.METADATA_SOURCE_URL] = key_as_str(data, url)
            metadata[MetadataConstants.METADATA_SOURCE_NAME] = key_as_str(data, 'Socrata Portal ' + portal_url)
                
            metadata[MetadataConstants.NAME] = key_as_str(data, 'name')
            metadata[MetadataConstants.PREFIX + 'Description'] = key_as_str(data, 'description')
            metadata[MetadataConstants.DISPLAY_TYPE_KEY] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Category'] = key_as_str(data, 'category')
            metadata[MetadataConstants.PREFIX + 'Owner'] = key_as_str(data['owner'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Download Count'] = key_as_str(data, 'downloadCount')
            metadata[MetadataConstants.PREFIX + 'View Count'] = key_as_str(data, 'viewCount')
            metadata[MetadataConstants.PREFIX + 'Comments'] = key_as_str(data, 'numberOfComments')
            metadata[MetadataConstants.PREFIX + 'Author'] = key_as_str(data['tableAuthor'], 'displayName')
            metadata[MetadataConstants.PREFIX + 'Id'] = key_as_str(data, 'id')
            metadata[MetadataConstants.PREFIX + 'Attribution'] = key_as_str(data, 'attribution')
            metadata[MetadataConstants.PREFIX + 'View Type'] = key_as_str(data, 'viewType')
            metadata[MetadataConstants.PREFIX + 'Display Type'] = key_as_str(data, 'displayType')
            metadata[MetadataConstants.PREFIX + 'Number of Comments'] = key_as_str(data, 'numberOfComments')
            ##> Discover if this dataset is a view
            if 'modifyingViewUid' not in data: metadata[MetadataConstants.PREFIX + 'View From'] = None
            else: metadata[MetadataConstants.PREFIX + 'View From'] = key_as_str(data,'modifyingViewUid')
            
            timestamp = int(data['createdAt'].__str__())
            metadata[MetadataConstants.PREFIX + 'Created At'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['viewLastModified'].__str__())
            metadata[MetadataConstants.PREFIX + 'Last Modified'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            timestamp = int(data['publicationDate'].__str__())
            metadata[MetadataConstants.PREFIX + 'Publication Date'] = datetime.datetime.fromtimestamp(timestamp).__str__()
            metadata['Tags'] = key_as_str(data, 'tags')
            if metadata['Tags'] == 'None': metadata['Tags'] = None
            
            if 'metadata' in data and 'custom_fields' in data['metadata']:
                custom_fields = data['metadata']['custom_fields']
                if 'Update' in custom_fields and 'Update Frequency' in custom_fields['Update']: 
                    metadata[MetadataConstants.PREFIX + 'Update Frequency'] = custom_fields['Update']['Update Frequency'].__str__()
                if 'Dataset Information' in custom_fields and 'Agency' in custom_fields['Dataset Information']: 
                    metadata[MetadataConstants.PREFIX + 'Agency'] = custom_fields['Dataset Information']['Agency'].__str__()

            types = {}
            columns = data['columns']
            for col in columns:
                col_name = col['name'].strip(' ').encode('ascii','ignore')
                col_type = col['dataTypeName']
                types[col_name] = col_type
            metadata[MetadataConstants.PREFIX + 'Types'] = types

            metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_SUCCESS
        else:
            if 'Cannot find view with id' in data['message']:
                metadata[MetadataConstants.STATUS] = MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND 
            else:
                metadata[MetadataConstants.STATUS] = 'Error'
            metadata['message'] = data['message']
    # except e:
    #     raise e
    #      #This means that it is not from socrata
    #      # Or that some other error occurred
    #      #just return None
    #     if first: 
    #         App.debug('Waiting to try again')
    #         sleep(0.5)
    #         return metadata_of(database_id, first=False)
    #     metadata[MetadataConstants.STATUS] = 'Error Exception'
    #     metadata['message'] = 'Error acessing a Socrata Portal with url: {0}'.format(url)
    
    if metadata[MetadataConstants.STATUS] != MetadataConstants.STATUS_SUCCESS:
        # logging.warn(metadata[STATUS])
        App.debug('WARNING: ', metadata[MetadataConstants.STATUS])
    
    # Before returning, convert any unicode values to plain str
    for k in metadata.keys():
        metadata[k] = TextUtils.reencode_text_if_not_ascii(metadata[k])
#         if type(metadata[k]) is unicode: 
# #            print '    Unicode info found on key: ' , k, '=', metadata[k] 
#             metadata[k] = metadata[k].encode('ascii','ignore')

    ##> If there was an error, show url so user can check
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_ERROR_VIEW_NOT_FOUND: 
        App.info('    Metadata not found on Socrata with url: ' + url)
    
    ##> Show dataset retrieved name to indicate success
    if metadata[MetadataConstants.STATUS] == MetadataConstants.STATUS_SUCCESS: 
        App.info('    OK. Dataset Retrieved Name: ' + metadata[ MetadataConstants.NAME ] )

    App.debug('Retrieved Metadata: \n' + json.dumps(metadata, ensure_ascii=False, indent=4, sort_keys=True) )
    return metadata
Example #18
def detect_from_regex(regex_detector, column):
    type_name = regex_detector[DETECTOR_NAME]
    regex_list = regex_detector[REGEX_LIST]
    all_matched_indexes = []

    App.debug('Detecting: ', type_name, ' with ', len(regex_list),
              ' regexes on ', len(column), ' items.')
    if len(column) == 0:
        return type_name, column, column

    for regex in regex_list:
        matched_indexes = column.index[column.astype(str).str.upper().apply(
            lambda x: regex.match(x) is not None)]
        App.debug('   matched: ', len(matched_indexes))
        all_matched_indexes += list(matched_indexes)

        #stop if all is matched
        if len(all_matched_indexes) == len(column): break

    App.debug('all_matched_indexes size: ', len(all_matched_indexes))
    all_matched_indexes = set(all_matched_indexes)
    not_matched_indexes = set(column.keys()).difference(all_matched_indexes)

    if len(all_matched_indexes) > 0:
        detected = column[list(all_matched_indexes)]
    else:
        detected = pandas.Series()
    not_detected = column[list(not_matched_indexes)]

    App.debug('Example of Values: ')
    App.debug('   detected: ', detected[:10].values)
    App.debug('   not-detected: ', not_detected[:10].values)

    App.debug('Result: ')
    App.debug('   detected: ', len(detected))

    return type_name, detected, not_detected
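A self-contained sketch of the regex-matching step above; the ZIP-style pattern is a hypothetical example, not the project's actual detector definition:

import re

import pandas

regex = re.compile(r'^\d{5}$')  # assumed pattern, for illustration only
column = pandas.Series(['10027', 'n/a', '11201'])

matched_indexes = column.index[column.astype(str).str.upper().apply(
    lambda x: regex.match(x) is not None)]
print(list(matched_indexes))                   # e.g. [0, 2]
print(column[list(matched_indexes)].tolist())  # e.g. ['10027', '11201']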
Example #19
def simple_type_of_considering_all(column_types_count, metadata_type,
                                   column_name):
    App.debug('[simple_type_of_considering_all]')
    App.debug('column_types_count=', column_types_count)
    App.debug('metadata_type=', metadata_type)
    App.debug('column_name=', column_name)

    simple_types = simplify(column_types_count)
    App.debug('simple_types=', simple_types)

    #From Socrata
    if metadata_type is not None:
        if metadata_type == 'calendar_date': return TEMPORAL
        elif metadata_type == 'location': return GEO

    # With name insight
    upper_column_name = column_name.upper()
    App.debug('match_name(column_name, NAME_DETECTOR_GEO):',
              match_name(upper_column_name, NAME_DETECTOR_GEO))
    App.debug('match_name(column_name, NAME_DETECTOR_TEMPORAL):',
              match_name(upper_column_name, NAME_DETECTOR_TEMPORAL))
    if simple_types[GEO] > 0 and match_name(upper_column_name,
                                            NAME_DETECTOR_GEO):
        return GEO

    # If the most detected type is GPS but the column name does not look
    # geographic, drop GEO as an option.
    elif most_detected(simple_types)[0] == GEO_GPS:
        App.debug('Removing GEO as an option!')
        simple_types.pop(GEO, None)
    if simple_types[TEMPORAL] > 0 and match_name(upper_column_name,
                                                 NAME_DETECTOR_TEMPORAL):
        return TEMPORAL

    return most_detected(simple_types)[0]
Example #20
def load_database(database_file, skiprows=None, nrows=None):
    # It is a Socrata CSV database. The wget download does not always keep the file extension as it should.
    file_type = 'CSV'  #default if no extension is found.
    if database_file.endswith('.csv'): file_type = 'CSV'
    if database_file.endswith('.json'): file_type = 'JSON'

    file_encoding = get_encoding(database_file)
    App.info('   > File encoding: %s' % file_encoding)

    if file_type == 'CSV':
        App.debug('CSV: Reading column headers from first line.')
        cols = FileUtils.get_cols_from_csv_header(database_file)
        App.debug('Preparing column types for pandas.')
        dtypes = prepare_dtypes_for_loading(cols)
        try:
            App.debug('Trying to read csv...')
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   dtype=dtypes)
        except:
            App.debug('Default CSV did not work.')
            App.debug('Trying to read with tab as separator...')
            # The error may mean the file is tab-separated rather than comma-separated
            return pandas.read_csv(database_file,
                                   skiprows=skiprows,
                                   nrows=nrows,
                                   low_memory=LOW_MEMORY,
                                   encoding=file_encoding,
                                   sep='\t',
                                   dtype=dtypes)

    elif file_type == 'JSON':
        # This works for JSON in the Socrata format, which has a 'data' field.
        # Otherwise, assume the file already holds the data itself.
        json_file = open(database_file)
        json_data = json.load(json_file)

        if 'data' in json_data.keys():
            App.debug('JSON: Read data from data field. (Socrata format)')
            data = json_data['data']
            cols = []
            cols_with_sub_cols = []

            App.debug('Getting column names from metadata...')
            for col in json_data['meta']['view']['columns']:
                cols.append(col['name'])

                if 'subColumnTypes' in col.keys():
                    print '    (!) Column ', col[
                        'name'], ' has sub columns: ', col['subColumnTypes']
                    cols_with_sub_cols.append(col)

            dtypes = prepare_dtypes_for_loading(cols)
            df = pandas.DataFrame(data, columns=cols)

            #create subcolumn data
            for col in cols_with_sub_cols:
                print '    Fetching sub columns of ', col['name']
                i = 0
                for sub_col in col['subColumnTypes']:
                    print '         >', sub_col
                    df[col['name'] + NEW_COLUMN_NAME_SEPARATOR +
                       sub_col] = df[col['name']].apply(lambda x: x[i])
                    i += 1
                print '    Removing source column ', col[
                    'name'], ' from data frame.'
                # Then remove the multi-valued source column
                df.drop(col['name'], axis=1, inplace=True)
            return df

        else:
            App.debug(
                'JSON: There is no data field. Getting column names from JSON keys.'
            )
            #get the list of cols from the json
            cols = list(json_data.keys())
            dtypes = prepare_dtypes_for_loading(cols)
            # DataFrame() does not accept per-column dtypes; apply them afterwards with astype()
            return pandas.DataFrame(json_data).astype(dtypes)
    else:
        print '===> PandasUtilError: Invalid database file: [{0}]'.format(database_file)
        #        raise ApplicationExecption('File must be json or csv!'.format(database_file))
        raise RuntimeError(
            'File must be json (with data inside a data field) or csv! File: [{0}]'.format(
                database_file))
Example #21
def generate_partial_index_on(geo_col, geo_type, temp_col, temp_type, dataset, db_name):
    App.info('Generating index for ({0}) and ({1})'.format(geo_col, temp_col), TimeUtils.current_time_formated())
    App.debug('geo_type: ', geo_type)

    an_index = pandas.DataFrame(columns=INDEX_COLUMNS)
    countby = []

    # ## 1. ADD GEO VALUES TO INDEX
    if geo_type:
        App.info('	Processing geo part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        # TODO: ENHANCE GEO INDEX <-------- (TODO)
        if geo_type == TypeDetector.GEO_GPS:
            an_index.lat, an_index.lon = PandasUtils.get_lat_lon_from_gps(dataset[geo_col])
            countby += ['lat', 'lon']

        elif geo_type in [TypeDetector.GEO_ZIP, TypeDetector.GEO_ZIP_9]:
            an_index.zipcode = dataset[geo_col]
            countby += ['zipcode']

    # ## 2. ADD TEMPORAL VALUES TO INDEX
    if temp_type:
        App.info('	Processing temporal part... '.ljust(50) + '@' + TimeUtils.current_time_formated())
        datetimes = dataset[temp_col].apply(lambda x: TimeUtils.datetime_from_str_date(x))
        if temp_type in [TypeDetector.TEMPORAL_DATE, TypeDetector.TEMPORAL_DATE_TIME]:
            # an_index['epoch_secs'] = dataset[temp_col].apply(lambda x: TimeUtils.epoch_from_str_date(x))

            an_index['year'] = datetimes.apply(lambda x: str(x.year) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['month'] = datetimes.apply(lambda x: str(x.month) if x else Constants.MISSING_DATA_SYMBOL)
            an_index['day'] = datetimes.apply(lambda x: str(x.day) if x else Constants.MISSING_DATA_SYMBOL)
            # countby += ['epoch_secs']
            countby += ['year', 'month', 'day']

        # if temp_type == TypeDetector.TEMPORAL_DATE_TIME:
        # 	an_index['hour'] = datetimes.apply(lambda x: str(x.hour) if x else Constants.MISSING_DATA_SYMBOL)

        App.info('	Counting... '.ljust(50) + '@' + TimeUtils.current_time_formated())

    # This order must not change without changing the algorithm: first count, then clean.
    # --------- Count rows for Index ------------------------------------------------------------------
    # 3. create index counts
    # print '-------------------- countby=', countby
    temp = an_index[countby].reset_index().groupby(countby).agg(['count'])
    temp.columns = ['count']
    temp.reset_index(inplace=True)
    # join with real dataset and add to index
    merged = pandas.merge(an_index, temp, how='inner', on=countby)
    # Add count to an_index
    an_index['count'] = merged['count']

    # --------- 4. Clean Index: null and invalid values --------------------------------------------------
    # print '<><><><><><><><><><><><><> an_index.count()=', an_index.count()
    App.info('	Cleaning... '.ljust(50) + '@' + TimeUtils.current_time_formated())

    used_index_cols = list(an_index.count()[an_index.count() > 0].index)
    for col in used_index_cols:
        # geo
        if col in ['lat', 'lon']: col_type = TypeDetector.GEO_GPS_LATLON
        if col == 'zipcode': col_type = TypeDetector.GEO_ZIP
        if col == 'address': col_type = TypeDetector.GEO_ADDRESS
        if col == 'borough': col_type = TypeDetector.GEO_BOROUGH
        # temp
        if col in ['epoch_secs', 'year', 'month', 'day', 'hour']: col_type = TypeDetector.NUMERIC_INT
        App.info('	   > {0}: {1}'.format(col, col_type).ljust(50) + '@' + TimeUtils.current_time_formated())
        # clean_invalid_values(col_type, col, an_index)
        an_index = an_index[an_index[col].apply(lambda x: PandasUtils.is_valid(x))]

    App.debug('>>>>> an_index (len 20) <<<<<')
    App.debug(an_index[:20])
    App.info('     Partial Index created with {0} rows'.format(len(an_index)))
    # 5. return index to be added to the main index
    return an_index
Example #22
def detect_zip(column):
    App.debug('Detecting: ZIP')

    prepared_col_data = column.dropna()

    if column.dtype != numpy.int64:
        App.debug(
            'Removing .0 from string. As can be float64 due to Pandas issue.')
        prepared_col_data = prepared_col_data.astype(str).apply(
            lambda x: x.replace('.0', ''))
        App.debug('Prepared data:\n', prepared_col_data)


#   type, detected, not_detected = detect_from_regex(DETECTOR_GEO_ZIP, prepared_col_data)
    type = GEO_ZIP
    #    print '--------------------------------------------'
    #    print column.astype(str).str.upper().apply(lambda x: x in VALID_ZIP_CODES is True)
    #    print '--------------------------------------------'
    #    print column.astype(str).str.upper().apply(lambda x: x not in VALID_ZIP_CODES is True)
    #    print '--------------------------------------------'
    detected = prepared_col_data[prepared_col_data.astype(
        str).str.upper().apply(lambda x: x in VALID_ZIP_CODES)]
    not_detected = prepared_col_data[prepared_col_data.astype(
        str).str.upper().apply(lambda x: x not in VALID_ZIP_CODES)]

    App.debug('Detected Type:', type)
    App.debug('Detected Values:', len(detected), ' - ', detected.values[:10])
    App.debug('Non Detected Values:', len(not_detected), ' - ',
              not_detected.values[:10])

    return type, detected, not_detected
Example #23
def detect_using_dynamic(detector, values_to_detect_type):
    App.debug('Dynamic Detecting: ', detector[DETECTOR_NAME])
    App.debug('    ACCEPT_NULLS: ', detector[ACCEPT_NULLS])
    if detector[DICTIONARY]:
        App.debug('    DICTIONARY (len): ', len(detector[DICTIONARY]),
                  ' - Sample:',
                  list(detector[DICTIONARY])[:10])
    if detector[REGEX_LIST]:
        App.debug('    REGEX_LIST: ', len(detector[REGEX_LIST]))

    if detector[ACCEPT_NULLS]:
        App.debug('    detect_null(...) ')
        return detect_null(values_to_detect_type)

    elif detector[DICTIONARY]:
        App.debug('    detect_from_dictionary(...) ')
        return detect_from_dictionary(detector, values_to_detect_type,
                                      detector[DICTIONARY_COMPARISON_TYPE])

    elif detector[REGEX_LIST][0]:
        App.debug('    detect_from_regex(...) ')
        return detect_from_regex(detector, values_to_detect_type)
Example #24
def detect_from_dictionary(detector,
                           values_to_detect_type,
                           comparison_type="Equal"):
    type_name = detector[DETECTOR_NAME]
    dictionary = detector[DICTIONARY]
    all_matched_indexes = []

    App.debug('Detecting: ', type_name)
    App.debug('    with ', len(dictionary), ' dictionary entries on ',
              len(values_to_detect_type), ' items.')

    if len(values_to_detect_type) == 0:
        return type_name, values_to_detect_type, values_to_detect_type

    prepared_values = values_to_detect_type
    if values_to_detect_type.dtype == numpy.float64:
        prepared_values = prepared_values.astype(str).apply(
            lambda x: x.replace('.0', ''))

    App.debug('comparison_type: ', comparison_type)
    if comparison_type == "Equal":
        matched_indexes = prepared_values.index[prepared_values.astype(
            str).str.upper().apply(lambda x: x in dictionary)]
    elif comparison_type == "Contains":
        contains = lambda x: any(valid in x for valid in list(dictionary))
        matched_indexes = prepared_values.index[prepared_values.astype(
            str).str.upper().apply(contains)]
    elif comparison_type == "Contains Word":
        contains = lambda x: any(word in dictionary for word in x.split(' '))
        matched_indexes = prepared_values.index[prepared_values.astype(
            str).str.upper().apply(contains)]
    else:
        error_message = 'Unknown Comparison type "' + comparison_type + '" in Dynamic Types File.'
        raise Exception(error_message)

    App.debug('   matched: ', len(matched_indexes))
    all_matched_indexes = list(matched_indexes)

    App.debug('all_matched_indexes size: ', len(all_matched_indexes))
    all_matched_indexes = set(all_matched_indexes)
    not_matched_indexes = set(
        values_to_detect_type.keys()).difference(all_matched_indexes)

    detected = values_to_detect_type[list(all_matched_indexes)]
    not_detected = values_to_detect_type[list(not_matched_indexes)]

    App.debug('Example of Values: ')
    App.debug('   >> detected: ', detected[:10].values)
    App.debug('   not-detected: ', not_detected[:10].values)

    App.debug('Result: ')
    App.debug('   detected: ', len(detected))

    return type_name, detected, not_detected
Example #25
def data_detectors():

    types_file = App.get_option('types_file', default=None)
    if types_file and types_file.lower() == 'true':
        types_file = TYPES_REFERENCE_FILE
    if types_file and os.path.exists(types_file):
        global LOADED_DETECTORS
        if LOADED_DETECTORS is None:
            App.debug(' >>> Loading dynamic types from file: ', types_file)
            types = pandas.read_csv(types_file,
                                    header=None,
                                    skipinitialspace=True)
            types = types.where((pandas.notnull(types)),
                                None)  #Transform NaN into None
            LOADED_DETECTORS = []
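            # Each row of the dynamic types file is read positionally below:
            # [0] type name, [1] sub-type name, [2] regex, [3] dictionary (inline CSV text or a file path),
            # [4] whether the dictionary is a file, [5] accept nulls, [6] dictionary comparison type.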
            for i in types.index:
                App.debug("")
                #1. Name
                name = types.ix[i][0]
                if types.ix[i][0] != types.ix[i][1]:
                    name += '-' + types.ix[i][1]
                App.debug("name= ", name)
                #2. Regex
                regex_list = types.ix[i][2]
                App.debug("regex= ", regex_list)
                if type(regex_list) == str:
                    regex_list = re.compile(types.ix[i][2])
                #3 & 4. Prepare values dictionary
                values_dictionary = types.ix[i][3]
                App.debug("values_dictionary= ", values_dictionary)

                dictionary_is_file = types.ix[i][4]
                App.debug("dictionary_is_file= ", dictionary_is_file)

                if type(values_dictionary) == str:  #is not None or Nan
                    #Read the file into the csv
                    if dictionary_is_file:
                        with open(
                                ResourceUtils.resource_path_of(
                                    values_dictionary)) as dict_file:
                            values_dictionary = dict_file.read()

                    #Parse string CSV into a set
                    reader = csv.reader(values_dictionary.splitlines(),
                                        delimiter=',',
                                        skipinitialspace=True)
                    values_dictionary = []
                    for row in reader:
                        values_dictionary.extend(row)
                    values_dictionary = set(values_dictionary)

                #5. Accept Nulls?
                accept_nulls = types.ix[i][5]
                App.debug("accept_nulls= ", accept_nulls)
                #6. Comparison type
                comparisson_type = types.ix[i][6]
                App.debug("Dictionary comparisson type= ", comparisson_type)

                LOADED_DETECTORS.append({
                    DETECTOR_NAME: name,
                    REGEX_LIST: [regex_list],
                    DICTIONARY: values_dictionary,
                    ACCEPT_NULLS: accept_nulls,
                    DICTIONARY_COMPARISON_TYPE: comparisson_type,
                })
            App.debug('Loaded types:')
            for item in LOADED_DETECTORS:
                App.debug(item[DETECTOR_NAME])
        return 'Dynamic', LOADED_DETECTORS

    else:
        #Detector must be in desired order to run
        return STATIC_DETECTORS, [
            {
                DETECTOR_NAME: NULL,
                FUNCTION: detect_null
            },
            {
                DETECTOR_NAME: GEO_ZIP,
                FUNCTION: detect_zip
            },
            DETECTOR_SSN,
            DETECTOR_GEO_ZIP_9,
            DETECTOR_GEO_GPS_LAT_OR_LON,
            DETECTOR_GEO_GPS,
            DETECTOR_GEO_BOROUGH,
            DETECTOR_GEO_ADDRESS,
            # {DETECTOR_NAME: GEO_ADDRESS, FUNCTION: detect_us_address},
            DETECTOR_TEMPORAL_DATE,
            DETECTOR_TEMPORAL_TIME,
            DETECTOR_TEMPORAL_DATE_TIME,
            DETECTOR_PHONE,
            DETECTOR_NUMERIC_INT,
            DETECTOR_NUMERIC_DOUBLE,
            {
                DETECTOR_NAME: TEXTUAL,
                FUNCTION: detect_text
            },
        ]
Example #26
    def do_profile(self, database, file_name, skip_rows, n_rows, metadata):
        printv = self.printv
        self.db_name = metadata['db_name']
        self.last_sumary = None
        self.column_metadata = pd.DataFrame(
            columns=['Database-id', 'Column-name', 'Group', 'Key', 'Value'])

        readable_time_start = TimeUtils.current_time_formated()
        time_start = time.time()

        total_rows = len(database.index)
        total_missing_values = 0

        text_cols = []
        text_cols_names = []
        num_cols = []
        num_cols_names = []
        geo_cols = []
        geo_cols_names = []
        temp_cols = []
        temp_cols_names = []
        null_cols = []
        null_cols_names = []
        self.gps_cols = []
        self.zip_cols = []
        self.types_summary = pd.DataFrame(columns=[
            'database-id', 'column-name', 'socrata-type', 'profiler-type',
            'profiler-most-detected'
        ])

        str_cols_types_percent = ''
        str_cols_types_percent_simple = ''

        printv('\nMetadata Types (Socrata Only): ')
        metadata_types = {}
        if metadata is not None and MetadataConstants.TYPES in metadata.keys():
            metadata_types = metadata[MetadataConstants.TYPES]
        for col in metadata_types.keys():
            printv('    ' + col + ': ' + metadata_types[col])

        SocrataUtils.prepare_location_columns(database, metadata_types)
        col_names = database.dtypes.keys()

        printv('\nProfiling {0} columns.'.format(col_names.size))
        # print 'col_names=', col_names
        for i in range(0, col_names.size):
            # reset col description
            unique = counts = vmin = vmax = std = mean = length_min = length_max = length_std = length_mean = missing = None
            col = database[col_names[i]]
            col_name = col_names[i]

            App.debug('Profiling column: ', col_name)
            App.debug('Pandas DType: ', col.dtype)

            if col_name in metadata_types:
                metadata_type = metadata_types[col_name]
            else:
                metadata_type = ''
            App.debug('Metadata type: ', metadata_type)

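            # Type detection below runs on the column's unique non-null values only.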
            unique_data = pd.DataFrame()
            temp = col.dropna()
            # NOTE: unique() can raise "TypeError: unhashable type: 'dict'"
            # when cells contain dicts (this looks like a pandas limitation).
            temp = temp.unique()
            unique_data[col_name] = temp
            unique_data = unique_data[col_name]

            col_types_percent_detailed = TypeDetector.types_of(unique_data)
            App.debug('[var] col_types_percent_detailed: ',
                      col_types_percent_detailed)

            processed_col_name = col_name.replace("'", "`")
            str_cols_types_percent += "'{0}': {1}\n".format(
                processed_col_name, col_types_percent_detailed)

            col_types_percent = TypeDetector.simplify(
                col_types_percent_detailed)
            printv('{0}: {1}'.format(col_name, col_types_percent))
            str_cols_types_percent_simple += "'{0}': {1}\n".format(
                processed_col_name, col_types_percent)

            data_type = TypeDetector.simple_type_of_considering_all(
                col_types_percent_detailed, metadata_type, col_name)

            most_detected, precision = TypeDetector.most_detected(
                col_types_percent_detailed)

            valid_col_values = TypeDetector.valid_values_of_type(
                most_detected, col)
            ########## Geographic  ##########
            if data_type == TypeDetector.GEO:
                printv(
                    "Processing Column {0}: {1} - Geo".format(
                        i + 1, col_names[i]), str(i + 1))
                # printv('    Processing Count', '.')
                count = col.count()
                missing = total_rows - count
                total_missing_values += missing
                # printv('    Processing Value Counts', '.')
                value_counts = valid_col_values.value_counts()
                top = value_counts.first_valid_index()
                freq = 0
                if top is not None:
                    freq = str(value_counts[top])

                # improve readability if is zip code
                if type(top) is not str and isinstance(top, int):
                    top = '%i' % top

                unique = len(value_counts)
                geo_cols_names.append(col_name)
                geo_cols.append({
                    'Count': count,
                    'Missing': missing,
                    'Unique Values': unique,
                    'Most Frequent': top,
                    'Top Frequency': freq,
                    'Min': col.min(),
                    'Max': col.max(),
                    'Types Percent': str(col_types_percent_detailed)
                })
                if most_detected == TypeDetector.GEO_GPS:
                    self.gps_cols.append(col_name)
                # Save column metadata
                # Only makes sense to save these numeric descriptors if is a lat, lon
                if most_detected == TypeDetector.GEO_GPS_LATLON:
                    self.add_column_metadata(col_name, 'Type Details', 'min',
                                             vmin)
                    self.add_column_metadata(col_name, 'Type Details', 'max',
                                             vmax)
                    self.add_column_metadata(col_name, 'Type Details', 'std',
                                             std)
                    self.add_column_metadata(col_name, 'Type Details', 'mean',
                                             mean)

            ########## Numeric  ##########
            elif data_type == TypeDetector.NUMERIC:
                printv(
                    "Processing Column {0}: {1} - Numeric".format(
                        i + 1, col_names[i]), str(i + 1))
                # printv('    Get valid Numeric gps_data', '.')
                col_data_numeric = TypeDetector.get_numeric_data(col)
                printv('    Processing Count', '.')
                count = col_data_numeric.count()
                missing = total_rows - count
                total_missing_values += missing
                if count > 0:
                    # printv('    Processing Mean', '.')
                    mean = np.mean(col_data_numeric)
                    # printv('    Processing Std', '.')
                    std = np.std(col_data_numeric)
                    # printv('    Processing Min', '.')
                    vmin = np.min(col_data_numeric)
                    # printv('    Processing Max', '.')
                    vmax = np.max(col_data_numeric)
                    # printv('    Processing Unique', '.')
                    unique = col_data_numeric.nunique()
                    value_counts = col_data_numeric.value_counts()

                    top = value_counts.keys()[0]
                    freq = value_counts[top]

                    # Histogram Data, default bins=10
                    num_bins = min([Profiler.MAX_BINS, unique])
                    hist_counts, bins = np.histogram(col_data_numeric,
                                                     bins=num_bins)
                    hist = pd.Series(hist_counts, index=bins[:-1])
                    hist_json = hist.to_json()
                else:
                    printv('    All NaN values')
                    hist_json = mean = std = vmin = vmax = unique = freq = None

                num_cols_names.append(col_name)
                num_cols.append({
                    'Count': count,
                    'Missing': missing,
                    'Mean': mean,
                    'Std': std,
                    'Min': vmin,
                    'Max': vmax,
                    'Unique Values': unique,
                    'Most Frequent': top,
                    'Top Frequency': freq,
                    'Types Percent': str(col_types_percent_detailed),
                    'Histogram Data JSON': hist_json,
                })
                # Save column metadata
                self.add_column_metadata(col_name, 'Type Details', 'min', vmin)
                self.add_column_metadata(col_name, 'Type Details', 'max', vmax)
                self.add_column_metadata(col_name, 'Type Details', 'std', std)
                self.add_column_metadata(col_name, 'Type Details', 'mean',
                                         mean)
                self.add_column_metadata(col_name, 'Type Details',
                                         'Histogram Data JSON', hist_json)

            ########## Temporal  ##########
            elif data_type == TypeDetector.TEMPORAL:
                printv(
                    "Processing Column {0}: {1} - Temporal".format(
                        i + 1, col_names[i]), str(i + 1))
                # printv('    Processing Info', '.')
                info = col.astype(str).describe()
                temp_cols_names.append(col_name)
                printv('    Processing Count', '.')
                count = info['count']
                missing = total_rows - count
                total_missing_values += missing
                # printv('    Processing Lenght', '.')
                lenghts = col.str.len()
                len_min = lenghts.min()
                len_max = lenghts.max()
                len_mean = lenghts.mean()
                len_std = lenghts.std()
                top = None
                if 'top' in info.keys(): top = info['top'].__repr__().strip()
                freq = None
                if 'freq' in info.keys(): freq = info['freq']
                # printv('    Processing Min', '.')
                vmin = col.min()
                # printv('    Processing Max', '.')
                vmax = col.max()
                unique = info['unique']
                value_counts = valid_col_values.value_counts()
                temp_cols.append({
                    'Count': count,
                    'Missing': missing,
                    'Unique Values': unique,
                    'Most Frequent': top,
                    'Top Frequency': freq,
                    'Min': vmin,
                    'Max': vmax,
                    #                    'Lenght Min':    '{0:.0f}'.format(len_min),
                    #                    'Lenght Max':    '{0:.0f}'.format(len_max),
                    #                    'Lenght Mean':   '{0:.2f}'.format(len_mean),
                    #                    'Lenght Std':    '{0:.2f}'.format(len_std),
                    'Types Percent': str(col_types_percent)
                })
                # Save column metadata
                self.add_column_metadata(col_name, 'Type Details', 'min', vmin)
                self.add_column_metadata(col_name, 'Type Details', 'max', vmax)
                self.add_column_metadata(col_name, 'Type Details', 'std', std)
                self.add_column_metadata(col_name, 'Type Details', 'mean',
                                         mean)
            ########## Textual  ##########
            elif data_type == TypeDetector.TEXTUAL:
                printv(
                    "Processing Column {0}: {1} - Text".format(
                        i + 1, col_names[i]), str(i + 1))
                #printv('    Processing Info', '.')
                info = col.astype(str).describe()
                text_cols_names.append(col_name)
                #printv('    Processing Count', '.')
                count = info['count']
                missing = total_rows - count
                total_missing_values += missing
                #printv('    Processing Lenght', '.')
                lenghts = col.astype(str).str.len()
                length_min = lenghts.min()
                length_max = lenghts.max()
                length_mean = lenghts.mean()
                length_std = lenghts.std()
                App.debug('Counting words...')
                word_counts = col.astype(str).apply(lambda x: len(x.split())
                                                    if x is not None else 0)
                word_count_min = word_counts.min()
                word_count_max = word_counts.max()
                word_count_std = word_counts.std()
                word_count_mean = word_counts.mean()
                top = None
                if 'top' in info.keys(): top = info['top'].__repr__().strip()
                freq = None
                if 'freq' in info.keys(): freq = info['freq']

                unique = info['unique']
                text_cols.append({
                    'Count': count,
                    'Missing': missing,
                    'Unique Values': unique,
                    'Most Frequent': top,
                    'Top Frequency': freq,
                    'Lenght Min': '{0:.0f}'.format(length_min),
                    'Lenght Max': '{0:.0f}'.format(length_max),
                    'Lenght Mean': '{0:.2f}'.format(length_mean),
                    'Lenght Std': '{0:.2f}'.format(length_std),
                    'Word Count Min': word_count_min,
                    'Word Count Max': word_count_max,
                    'Word Count Std': word_count_std,
                    'Word Count Mean': word_count_mean,
                    'Types Percent': str(col_types_percent_detailed)
                })
                # Save Column Metadata
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-min', length_min)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-max', length_max)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-std', length_std)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-mean', length_mean)
                self.add_column_metadata(col_name, 'Type Details', 'words-min',
                                         word_count_min)
                self.add_column_metadata(col_name, 'Type Details', 'words-max',
                                         word_count_max)
                self.add_column_metadata(col_name, 'Type Details',
                                         'words-mean', word_count_mean)
                self.add_column_metadata(col_name, 'Type Details', 'words-std',
                                         word_count_std)
                value_counts = valid_col_values.value_counts()

            else:  # data_type is TypeDetector.NULL:
                printv(
                    "Processing Column {0}: {1} - {2}".format(
                        i + 1, col_names[i], data_type), str(i + 1))
                # printv('    Processing Info', '.')
                info = col.astype(str).describe()
                null_cols_names.append(col_name)
                # printv('    Processing Count', '.')
                count = info['count']
                missing = len(col) - col.count()
                total_missing_values += missing
                # printv('    Processing Lenght', '.')
                lenghts = col.astype(str).apply(lambda x: len(x))
                length_min = lenghts.min()
                length_max = lenghts.max()
                length_mean = lenghts.mean()
                length_std = lenghts.std()
                top = None
                if 'top' in info.keys(): top = info['top'].__repr__().strip()
                freq = None
                if 'freq' in info.keys(): freq = info['freq']

                unique = info['unique']
                null_cols.append({
                    'Count': count,
                    'Missing': missing,
                    'Unique Values': unique,
                    'Most Frequent': top,
                    'Top Frequency': freq,
                    'Lenght Min': '{0:.0f}'.format(length_min),
                    'Lenght Max': '{0:.0f}'.format(length_max),
                    'Lenght Mean': '{0:.2f}'.format(length_mean),
                    'Lenght Std': '{0:.2f}'.format(length_std),
                    'Types Percent': str(col_types_percent_detailed)
                })
                # Save Column Metadata
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-min', length_min)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-max', length_max)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-std', length_std)
                self.add_column_metadata(col_name, 'Type Details',
                                         'length-mean', length_mean)
                # print 'valid_col_values[:10]=', valid_col_values[:10]
                # if len(valid_col_values) > 0:
                value_counts = valid_col_values.value_counts()
                # else:
                #     value_counts = {}

            # Add column info
            column_data = {
                'database-id': self.db_name,
                'column-name': col_name,
                'socrata-type': metadata_type,
                'profiler-type': data_type,
                'profiler-most-detected_%': precision,
                'profiler-most-detected': most_detected,
                'unique': unique,
                'missing': missing,
                'values': count,
            }
            # General
            self.add_column_metadata(col_name, 'General', 'top-value', top)
            self.add_column_metadata(col_name, 'General', 'top-freq', freq)
            #            self.add_column_metadata(col_name, 'General', 'profiler-most-detected_%', precision)
            #            self.add_column_metadata(col_name, 'General', 'profiler-most-detected', most_detected)

            # Add column index to column metadata
            if ProfilerUtils.COLUMN_INDEXES in metadata and col_name in metadata[
                    ProfilerUtils.COLUMN_INDEXES]:
                self.add_column_metadata(
                    col_name, 'General', 'index',
                    metadata[ProfilerUtils.COLUMN_INDEXES][col_name])

            if value_counts is not None:
                top_k = {}
                unique_values = len(value_counts)
                limit = min(Profiler.MAX_TOP_K, unique_values)
                if self.part:
                    limit = unique_values
                for k, k_count in value_counts.iloc[:limit].iteritems():
                    # iterate (value, count) pairs directly instead of re-indexing with
                    # value_counts[k], which is ambiguous for numeric labels
                    # (likely the bug the original TODO flagged)
                    top_k[str(k)] = k_count
                self.add_column_metadata(col_name, 'Type Details', 'top-k',
                                         top_k)

            # Simple type info
            for k in col_types_percent.keys():
                self.add_column_metadata(col_name, 'Simple Type', k,
                                         col_types_percent[k])
            # Complete type info
            for k in col_types_percent_detailed.keys():
                self.add_column_metadata(col_name, 'Detailed Type', k,
                                         col_types_percent_detailed[k])

            self.types_summary = self.types_summary.append(column_data,
                                                           ignore_index=True)

# ==============================================================================
# ============================ SUMMARIZE DATA ==================================
# ==============================================================================
# Summary DataFrames
        self.numeric_DataFrame = pd.DataFrame(num_cols, index=num_cols_names)
        self.geo_DataFrame = pd.DataFrame(geo_cols, index=geo_cols_names)
        self.textual_DataFrame = pd.DataFrame(text_cols, index=text_cols_names)
        self.null_DataFrame = pd.DataFrame(null_cols, index=null_cols_names)
        self.temporal_DataFrame = pd.DataFrame(temp_cols,
                                               index=temp_cols_names)

        time_end = time.time()
        readable_time_end = TimeUtils.current_time_formated()

        # Database Summary
        total_values = total_rows * col_names.size
        missing_percent = (total_missing_values * 100.0) / (total_values)
        missing_percent = '{0:.2f}'.format(missing_percent)

        processing_time = '{0:.2f}'.format(time_end - time_start)
        printv('\n\n=============== DATABASE SUMMARY ===============')
        printv('File: {0}'.format(self.db_name))
        printv('Rows: {0:n}'.format(total_rows))
        printv('Columns: {0:n}'.format(col_names.size))
        printv('  - Geo: {0:n}'.format(len(geo_cols_names)))
        printv('  - Temporal: {0:n}'.format(len(temp_cols_names)))
        printv('  - Numeric: {0:n}'.format(len(num_cols_names)))
        printv('  - Textual: {0:n}'.format(len(text_cols_names)))
        printv('Values')
        printv('  - Total:   {0:n} (Rows x Columns)'.format(total_values))
        printv('  - Missing: {0:n} ({1}%)'.format(total_missing_values,
                                                  missing_percent))
        printv('Processing time: {0} sec'.format(processing_time))

        used_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000  # ru_maxrss is in KB on Linux, so this is roughly MB
        # Count of columns not classified as numeric/geo/temporal/text; not used
        # below (the summary takes len(null_cols_names) directly)
        null_cols_count = col_names.size - len(num_cols_names) - len(
            geo_cols_names) - len(temp_cols_names) - len(text_cols_names)
        self.last_sumary = pd.DataFrame(
            [{
                'Name': self.db_name,
                'Rows': total_rows,
                'Columns': col_names.size,
                'Columns Numeric': len(num_cols_names),
                'Columns Temporal': len(temp_cols_names),
                'Columns Geo': len(geo_cols_names),
                'Columns Text': len(text_cols_names),
                'Columns Null': len(null_cols_names),
                'Column Names Numeric': str(num_cols_names),
                'Column Names Geo': str(geo_cols_names),
                'Column Names Text': str(text_cols_names),
                'Column Names Temporal': str(temp_cols_names),
                'Values': total_values,
                'Values Missing': total_missing_values,
                'Values Missing Percent': missing_percent,
                'ETL-Profiler Processing Time (sec)': processing_time,
                'ETL-Profiler Time Begin': readable_time_start,
                'ETL-Profiler Time End': readable_time_end,
                'ETL-Profiler Input File': file_name,
                'ETL-Profiler Input File Size (KB)': metadata['file_size'],
                'ETL-Profiler Total Memory (MB)': used_memory,
                Profiler.STATUS: Profiler.STATUS_SUCCESS
            }],
            columns=Profiler.SUMARY_COLUMNS,
            index=[self.db_name])

        if self.ignore_metadata:
            printv('=============== Metadata Ignored ===============')
        elif metadata is not None and MetadataUtils.has_success(metadata):
            try:
                printv('=============== PROVIDED METADATA ===============')
                for key in metadata.keys():
                    if key == MetadataConstants.TYPES:
                        continue  # Ignore provided types in the dataset metadata

                    value = metadata[key]
                    App.debug(key, '=', value, ' - type:', type(value))
                    self.last_sumary[key] = str(value)
                self.last_sumary[MetadataConstants.PRIMARY] = \
                    MetadataUtils.is_primary(metadata)
                printv('')
            except UnicodeEncodeError as ex:
                # Log and keep profiling; only re-raise when stop_on_error is set
                printv('UnicodeEncodeError with socrata metadata.')
                if self.stop_on_error: raise ex

        # ==============================================================================
        # ============================ Process Geo Data ====================================
        # ==============================================================================
        if self.ignore_index:
            printv('Ignoring geo-temp index')
        else:
            printv('Processing Geographic Data')
            self.generate_index(database)

# ==============================================================================
# ============================ SHOW RESULTS ====================================
# ==============================================================================
        if self.show_details:
            if self.show_all_columns:
                numeric_info_to_show = self.numeric_DataFrame.columns.tolist()

                text_info_to_show = self.textual_DataFrame.columns.tolist()
                if 'Value Counts' in text_info_to_show:
                    text_info_to_show.remove('Value Counts')

                geo_info_to_show = self.geo_DataFrame.columns.tolist()
                if 'Value Counts' in geo_info_to_show:
                    geo_info_to_show.remove('Value Counts')

                temporal_info_to_show = self.temporal_DataFrame.columns.tolist()
                if 'Value Counts' in temporal_info_to_show:
                    temporal_info_to_show.remove('Value Counts')
            else:
                numeric_info_to_show = Profiler.NUMERIC_INFO_SMALL
                text_info_to_show = Profiler.TEXT_INFO_SMALL
                geo_info_to_show = Profiler.GEO_INFO_SMALL
                temporal_info_to_show = Profiler.TEMPORAL_INFO_SMALL
            null_info_to_show = Profiler.TEMPORAL_INFO_SMALL  # null columns reuse the temporal small-info field list

            if len(geo_cols) > 0:
                print '\n=============== Geo Data Summary:'
                print self.geo_DataFrame[geo_info_to_show]
            if len(num_cols) > 0:
                print '=============== Numeric Summary:'
                print self.numeric_DataFrame[numeric_info_to_show]
            if len(temp_cols) > 0:
                print '\n=============== Temporal Summary:'
                print self.temporal_DataFrame[temporal_info_to_show]
            if len(text_cols) > 0:
                print '\n=============== Textual Summary:'
                print self.textual_DataFrame[text_info_to_show]
            if len(null_cols_names) > 0:
                print '\n=============== Null Summary:'
                print self.null_DataFrame[null_info_to_show]

            printv(
                '\n========================================================= Types Information:'
            )
            printv('        --- Complete ---')
            printv(str_cols_types_percent.rstrip('\n'))
            printv(
                '\n        ------------------- Types Summary ------------------- '
            )
            printv(self.types_summary)
            printv(
                '==============================================================='
            )
            printv(self.last_sumary.T)
#            printv('\n        ------------------- Column Metadata  ------------------- ')
#            printv( '===============================================================')
#            printv (self.column_metadata)

        printv(
            '===============================================================')

        # ==============================================================================
        # ================================ SAVE FILES ==================================
        # ==============================================================================
        if self.save_output_files:
            print 'Generated Files:'
            #            self.save_dataframe_to_csv(self.numeric_DataFrame, '_profiled_numeric')
            #            self.save_dataframe_to_csv(self.geo_DataFrame, '_profiled_geo')
            #            self.save_dataframe_to_csv(self.textual_DataFrame, '_profiled_textual')
            #            self.save_last_sumary_to_json()
            #            filename = self.to_folder + self.db_name + '_types.json'
            #            self.types_summary.to_csv()

            if self.last_zip_rows is None: print ' No Zip file to save.'
            else:
                filename = self.to_folder + self.db_name + '_zip.csv'
                self.last_zip_rows.to_csv(filename)
                print '    ' + filename

            if self.last_gps_rows is None: print ' No GPS file to save.'
            else:
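                # Assumed completion: the original example is truncated at this point,
                # so this mirrors the Zip branch above; the '_gps.csv' suffix is a
                # guess by analogy with '_zip.csv'.
                filename = self.to_folder + self.db_name + '_gps.csv'
                self.last_gps_rows.to_csv(filename)
                print '    ' + filename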
Example #27
0
def reduce_summaries(first, second):
    """
    This method joins two profile summaries in one that has information from both.
    If more than two summaries should be joined, join two by two.

    This method relies on naming convention for the variables to know how to join their values.
    For example, if the variable is count, then is just sum both counts.
    However if the variable is unique, then to join we need to consider the sets of values.
    Other examples are: sum, std, min, max.

    :param first: a summary to be joined
    :param second: another summary to be joined
    :return: a joined summary
    """
    # Init
    reprocess_column_types = False
    # print '\n\n------------------------------------------- reduce -------------------------------------------'
    # print '1st =>', first
    # print '\n2nd =>', second

    # return '(' + first + ' <_> ' + second + ')'

    # Verify if structure is the same and dataset too
    if first['Name'] != second['Name']:
        raise Exception('Summaries are not from the same dataset.')
    elif first['Columns'] != second['Columns']:
        raise Exception('Number of columns is not the same.')

    joined = {}
    # we'll assume both summaries have the same keys
    # TODO: Protect to when both don't have the same keys
    all_keys = first.keys()
    # all_keys = [TextUtils.reencode_if_not_ascii(k) for k in first.keys()]

    # Join values based on key convention names or specific keys
    App.info('Processing all keys: %s' % all_keys)
    for key in all_keys:
        App.debug('- Key: %s' % key)
        if key.lower().endswith('min') or key.lower().endswith('begin'):
            joined[key] = min(first[key], second[key])

        elif key.lower().endswith('max') or key.lower().endswith('end'):
            joined[key] = max(first[key], second[key])

        # Keys with no specific rule (not min/max/sum/etc.) fall through to the
        # final else below and simply take the first summary's value -- we
        # assume both summaries agree on them.
        # After joining the dataset-level metadata we still need to join the
        # per-column metadata.
        elif key == Profiler.COLUMN_METADATA:
            joined['Column Metadata'] = reduce_column_metadata(first, second)

        # TODO: join geo-temp index
        # elif key == 'Geo-Temp Index':

        elif key in [
                'Rows', 'Values', 'Values Missing',
                'ETL-Profiler Processing Time (sec)',
                'ETL-Profiler Total Memory (MB)', 'GPS Values'
        ]:
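            # Assumes these values are numeric; note that the summary built above
            # stores the processing time as a formatted string, which would
            # concatenate here rather than add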
            joined[key] = first[key] + second[key]

        elif key in ['Values Missing Percent']:
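            # Recompute the overall missing percentage as a row-weighted average
            # of the two partial summaries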
            total = int(first['Rows']) + int(second['Rows'])
            temp = (float(first[key]) * int(first['Rows']) +
                    float(second[key]) * int(second['Rows'])) / total
            joined[key] = round(temp, 2)

        elif key in [
                'Column Names Geo', 'Column Names Numeric',
                'Column Names Temporal', 'Column Names Text',
                'Columns Names Null'
        ]:
            if first[key] == second[key]:
                joined[key] = first[key]
            else:
                # TODO: should be processed later based on column types
                reprocess_column_types = True
        else:
            App.debug('    first["%s"]= %s' % (key, first[key]))
            if first[key]:
                joined[key] = first[key]
            else:
                App.debug('    -> Ignoring null Value')

    return joined
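

# Usage sketch (not part of the original source): joining a list of per-chunk
# summaries pairwise, as the docstring suggests. The dictionaries below are
# hypothetical and carry only a few of the keys a real summary has, and
# reduce_summaries still needs the Profiler/App helpers from this module.
from functools import reduce

chunk_summaries = [
    {'Name': 'taxi_trips', 'Columns': 3, 'Rows': 100,
     'Values': 300, 'Values Missing': 6, 'Values Missing Percent': 2.00,
     'ETL-Profiler Time Begin': '2015-01-01 10:00:00',
     'ETL-Profiler Time End': '2015-01-01 10:05:00'},
    {'Name': 'taxi_trips', 'Columns': 3, 'Rows': 50,
     'Values': 150, 'Values Missing': 9, 'Values Missing Percent': 6.00,
     'ETL-Profiler Time Begin': '2015-01-01 10:05:00',
     'ETL-Profiler Time End': '2015-01-01 10:08:00'},
]

merged = reduce(reduce_summaries, chunk_summaries)
# 'Rows', 'Values' and 'Values Missing' are summed, '... Time Begin' takes the
# min, '... Time End' takes the max, and 'Values Missing Percent' becomes the
# row-weighted average: (2.00 * 100 + 6.00 * 50) / 150 = 3.33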