Example #1
    def post_device_data(self, df, sensor_id, clean_na = 'drop'):
        '''
            POST data to the SmartCitizen API
            Parameters
            ----------
                df: pandas DataFrame
                    Contains the data to post. Data is posted using the
                    sensor id provided, regardless of the dataframe column
                    name. Data is posted in UTC, so the dataframe needs a
                    localised timestamp index
                sensor_id: int
                    The sensor id
                clean_na: string, optional
                    'drop'
                    How to clean NaN values before posting: 'drop' or 'fill'
            Returns
            -------
                True if the data was posted successfully
        '''
        if 'SC_ADMIN_BEARER' not in environ:
            std_out('Cannot post without Auth Bearer', 'ERROR')
            return False

        headers = {'Authorization':'Bearer ' + environ['SC_ADMIN_BEARER'], 'Content-type': 'application/json'}

        # Get sensor name
        sensor_name = list(df.columns)[0]
        # Clean df of nans
        df = clean(df, clean_na, how = 'all')

        # Process dataframe
        df['id'] = sensor_id
        df.index.name = 'recorded_at'
        df.rename(columns = {sensor_name: 'value'}, inplace = True)
        df.columns = MultiIndex.from_product([['sensors'], df.columns])
        j = (df.groupby('recorded_at', as_index = True)
                .apply(lambda x: x['sensors'][['value', 'id']].to_dict('records'))
        )

        # Prepare json post
        payload = {"data":[]}
        for item in j.index:
            payload["data"].append(
                {
                    "recorded_at": localise_date(item, 'UTC').strftime('%Y-%m-%dT%H:%M:%SZ'),
                    "sensors": j[item]
                }
            )

        payload_json = dumps(payload)

        response = post(f'https://api.smartcitizen.me/v0/devices/{self.id}/readings', data = payload_json, headers = headers)
        if response.status_code == 200 or response.status_code == 201:
            return True

        return False
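For reference, a standalone sketch (not part of the library; the sensor id 53 and the values below are made up) of the payload shape this method builds and posts, one entry per timestamp:

import pandas as pd

idx = pd.date_range('2021-01-01', periods=2, freq='1Min', tz='UTC')
df = pd.DataFrame({'NOISE_A': [45.2, 47.8]}, index=idx)

# One dict per timestamp, each carrying a list of {'value', 'id'} readings
payload = {"data": [
    {
        "recorded_at": ts.strftime('%Y-%m-%dT%H:%M:%SZ'),
        "sensors": [{"value": float(row['NOISE_A']), "id": 53}]
    }
    for ts, row in df.iterrows()
]}
print(payload)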
Example #2
def prepare(self, measurand, inputs, options=dict()):
    """
    Prepares a test for a regression model
    Parameters
    ----------
        measurand: dict
            measurand = {'8019043': ['NO2']}
        inputs: dict
            inputs per device and reading
                inputs = {'devicename': ['reading-1', 'reading-2']}
        options: dict
            Options including data processing. Defaults in config._model_def_opt
    Returns
    -------
        df: pandas DataFrame
        measurand_name: string
    """

    options = dict_fmerge(options, config._model_def_opt)

    # Measurand
    measurand_device = list(measurand.keys())[0]
    measurand_metric = measurand[measurand_device][0]
    measurand_name = measurand[measurand_device][0] + '_' + measurand_device

    df = DataFrame()
    df[measurand_name] = self.devices[measurand_device].readings[
        measurand_metric]

    for input_device in inputs.keys():
        combined_df = self.combine(devices=[input_device],
                                   readings=inputs[input_device])
        df = df.combine_first(combined_df)

    if options['common_avg']:

        common_channels = inputs[list(inputs.keys())[0]]
        for input_device in inputs.keys():
            common_channels = list(
                set(common_channels).intersection(set(inputs[input_device])))
        std_out(f'Performing avg in common columns {common_channels}')
        for channel in common_channels:
            columns_list = [
                channel + '_' + device for device in list(inputs.keys())
            ]
            df[channel + '_AVG'] = df[columns_list].mean(axis=1)

        df = df.loc[:,
                    df.columns.str.contains("_AVG")
                    | df.columns.str.contains(measurand_name)]

    if options['clean_na'] is not None:
        df = clean(df, options['clean_na'], how='any')

    return df, measurand_name
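A hypothetical call, assuming a test object `test` with a reference device '8019043' and a sensor device 'sck_001' already loaded (the device and channel names are made up):

measurand = {'8019043': ['NO2']}
inputs = {'sck_001': ['GB_2W', 'GB_2A'], '8019043': ['TEMP']}
df, measurand_name = test.prepare(measurand, inputs)
# measurand_name is 'NO2_8019043'; df holds the target plus the input channels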
Example #3
def apply_regressor(dataframe, **kwargs):
    '''
    Applies a regressor model based on a pretrained model
    Parameters
    ----------
        model: sklearn predictor
            Model with .predict method
        options: dict
            Options for data preprocessing. Defaults in config.model_def_opt
        variables: dict
            variables dictionary with:
                {
                    'measurand': {
                        'measurand-device-name': ['measurand']
                    },
                    'inputs': {
                        'input-device-names': ['input-1', 'input-2', 'input-3']
                    }
                }
    Returns
    -------
        pandas series containing the prediction
    '''

    inputs = list()
    for device in kwargs['variables']['inputs']:
        inputs = list(
            set(inputs).union(set(kwargs['variables']['inputs'][device])))

    try:
        inputdf = dataframe[inputs].copy()
        inputdf = inputdf.reindex(sorted(inputdf.columns), axis=1)
    except KeyError:
        std_out('Inputs not in dataframe', 'ERROR')
        return None

    if 'model' not in kwargs:
        std_out('Model not in inputs', 'ERROR')
        return None
    model = kwargs['model']

    if 'options' not in kwargs:
        options = config.model_def_opt
    else:
        options = dict_fmerge(config.model_def_opt, kwargs['options'])

    # Remove na
    inputdf = clean(inputdf, options['clean_na'], how='any')

    features = array(inputdf)
    result = DataFrame(model.predict(features)).set_index(inputdf.index)

    return result
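A minimal sketch of calling the function, assuming `model` is a previously fitted sklearn regressor and `df` contains the same input channels used at training time (all names below are made up):

variables = {
    'measurand': {'8019043': ['NO2']},
    'inputs': {'sck_001': ['GB_2W', 'GB_2A']}
}
prediction = apply_regressor(df, model=model, variables=variables)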
Example #4
    def get_device_data(self, start_date = None, end_date = None, frequency = '3Min', clean_na = None):

        if start_date is not None: days_ago = (to_datetime(date.today())-to_datetime(start_date)).days
        else: days_ago = 365 # One year of data

        std_out(f'Requesting data from MUV API')
        std_out(f'Device ID: {self.id}')
        self.get_device_location()
        self.get_device_sensors()        
        
        # Get devices
        try:
            if days_ago == -1: url = f'{self.API_BASE_URL}getSensorData?sensor_id={self.id}'            
            else: url = f'{self.API_BASE_URL}getSensorData?sensor_id={self.id}&days={days_ago}'
            df = DataFrame(get(url).json())
        except:
            print_exc()
            std_out('Failed sensor request. Probably no connection', 'ERROR')
            return None

        try:
            # Rename columns
            df.rename(columns = self.sensors, inplace = True)
            df = df.set_index('time')

            df.index = localise_date(df.index, self.location)
            df = df[~df.index.duplicated(keep='first')]
            # Drop unnecessary columns
            df.drop([i for i in df.columns if 'Unnamed' in i], axis=1, inplace=True)
            df.drop('id', axis=1, inplace=True)
            # Check for weird things in the data
            df = df.apply(to_numeric, errors='coerce')
            # Resample
            df = df.resample(frequency).mean()
            df = df.reindex(df.index.rename('Time'))

            df = clean(df, clean_na, how = 'all')
                
            self.data = df
                
        except:
            print_exc()
            std_out('Problem closing up the API dataframe', 'ERROR')
            return None

        std_out(f'Device {self.id} loaded successfully from API', 'SUCCESS')
        return self.data
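A hypothetical call, assuming a device object of this MUV API class with its id already set (parameter values are made up):

data = device.get_device_data(start_date='2021-01-01', frequency='10Min', clean_na='drop')
# device.data now holds the localised, resampled dataframe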
Example #5
def ts_dendrogram(self, **kwargs):
    """
    Plots dendrogram of devices and channels in matplotlib plot. Takes all the channels 
    in channels that are in the test `devices`
    Parameters
    ----------
        devices: list or string
            'all'
            If 'all', uses all devices in the test
        channels: list
            'all'
            If 'all', uses all channels in the devices        
        metric: string
            'correlation' for normal R2 or custom metric by callable
        'method': string
            'single'
            Method for dendrogram
        'options': dict
            Options including data processing prior to plot. Defaults in config._plot_def_opt
        formatting: dict
            Name of auxiliary electrode found in dataframe. Defaults in config._ts_plot_def_fmt
    Returns
    -------
        Dendrogram matrix, shows plot
    """
    if 'metric' not in kwargs: metric = 'correlation'
    else: metric = kwargs['metric']

    if 'method' not in kwargs: method = 'single'
    else: method = kwargs['method']

    if 'devices' not in kwargs: devices = list(self.devices.keys())
    else: devices = kwargs['devices']

    if 'channels' not in kwargs: channels = 'all'
    else: channels = kwargs['channels']

    if 'options' not in kwargs:
        std_out('Using default options')
        options = config._plot_def_opt
    else:
        options = dict_fmerge(config._plot_def_opt, kwargs['options'])

    if 'formatting' not in kwargs:
        std_out('Using default formatting')
        formatting = config._ts_plot_def_fmt['mpl']
    else:
        formatting = dict_fmerge(config._ts_plot_def_fmt['mpl'],
                                 kwargs['formatting'])

    # Style
    if formatting['style'] is not None: style.use(formatting['style'])
    else: style.use(config._plot_style)

    # Palette
    if formatting['palette'] is not None: set_palette(formatting['palette'])

    # Size sanity check
    if formatting['width'] > 50:

        std_out('Reducing width to 12')
        formatting['width'] = 12

    if formatting['height'] > 50:

        std_out('Reducing height to 10')
        formatting['height'] = 10

    # Font size
    if formatting['fontsize'] is not None:
        rcParams.update({'font.size': formatting['fontsize']})

    df = DataFrame()

    for device in devices:
        dfd = self.devices[device].readings.copy()
        dfd = dfd.resample(options['frequency']).mean()

        if channels != 'all':
            for channel in channels:
                if channel in dfd.columns:
                    df = df.append(dfd[channel].rename(device + '_' + channel))
        else:
            df = df.append(dfd)

    df = clean(df, options['clean_na'], how='any')

    # if options['clean_na'] is not None:
    #     if options['clean_na'] == 'drop': df.dropna(axis = 1, inplace=True)
    #     if options['clean_na'] == 'fill': df = df.fillna(method='ffill')

    # Do the clustering
    Z = hac.linkage(df, method=method, metric=metric)

    # Plot dendrogram
    plt.figure(figsize=(formatting['width'], formatting['height']))
    plt.title(formatting['title'], fontsize=formatting['titlefontsize'])
    plt.subplots_adjust(top=formatting['suptitle_factor'])
    plt.xlabel(formatting['xlabel'])
    plt.ylabel(formatting['ylabel'])
    hac.dendrogram(
        Z,
        orientation=formatting['orientation'],
        leaf_font_size=formatting[
            'fontsize'],  # font size for the x axis labels
        labels=df.index)

    plt.show()

    return Z
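A hypothetical call, assuming a test object `test` with its devices loaded (the device and channel names are made up):

Z = test.ts_dendrogram(devices=['sck_001', 'sck_002'],
                       channels=['TEMP', 'HUM', 'NOISE_A'],
                       metric='correlation',
                       method='average',
                       options={'frequency': '1H', 'clean_na': 'drop'})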
Example #6
def read_csv_file(file_path,
                  timezone,
                  frequency,
                  clean_na=None,
                  index_name='',
                  skiprows=None,
                  sep=',',
                  encoding='utf-8',
                  tzaware=True):
    """
    Reads a csv file and adds cleaning, localisation and resampling and puts it into a pandas dataframe
    Parameters
    ----------
        file_path: String
            File path for csv file
        timezone: String
            Time zone for the csv file
        clean_na: String or None
            None
            Whether to perform clean_na or not. Either None, 'fill' or 'drop'
        index_name: String
            ''
            Name of the column to set an index in the dataframe
        skiprows: list or None
            None
            List of rows to skip (same as skiprows in pandas.read_csv)
        sep: String
            ','
            Separator (same as sep in pandas.read_csv)
        encoding: String
            'utf-8'
            Encoding of the csv file
    Returns
    -------
        Pandas dataframe
    """

    # Read pandas dataframe

    df = read_csv(file_path,
                  verbose=False,
                  skiprows=skiprows,
                  sep=sep,
                  encoding=encoding,
                  encoding_errors='ignore')

    flag_found = False
    if type(index_name) == str:
        # Single joint index
        for column in df.columns:
            if index_name in column:
                df = df.set_index(column)
                flag_found = True
                break
    elif type(index_name) == list:
        # Composite index (for instance, DATE and TIME in different columns)
        for iname in index_name:
            if iname not in df.columns:
                std_out(f'{iname} not found in columns', 'ERROR')
                return None
        joint_index_name = '_'.join(index_name)
        df[joint_index_name] = df[index_name].agg(' '.join, axis=1)
        df = df.set_index(joint_index_name)
        df.drop(index_name, axis=1, inplace=True)
        flag_found = True

    if not flag_found:
        std_out('Index not found. Cannot reindex', 'ERROR')
        return None

    # Set index
    df.index = localise_date(df.index, timezone, tzaware=tzaware)
    # Remove duplicates
    df = df[~df.index.duplicated(keep='first')]

    # Sort index
    df.sort_index(inplace=True)

    # Drop unnecessary columns
    df.drop([i for i in df.columns if 'Unnamed' in i], axis=1, inplace=True)

    # Check for weird things in the data
    # df = df.apply(to_numeric, errors='coerce')
    df = df.astype(float, errors='ignore')

    # Resample
    df = df.resample(frequency).mean()

    # Remove na
    df = clean(df, clean_na, how='all')

    return df
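A hypothetical call (the file path and index column are made up); a composite index can also be passed as a list, e.g. index_name=['DATE', 'TIME']:

df = read_csv_file('data/device_001.csv',
                   timezone='Europe/Madrid',
                   frequency='1Min',
                   clean_na='drop',
                   index_name='TIME')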
Example #7
def sdcard_concat(path,
                  output='CONCAT.CSV',
                  index_name='TIME',
                  keep=True,
                  ignore=['CONCAT.CSV', 'INFO.TXT']):
    '''
        Loads files from a local directory in text format, for instance
        SD card files with timestamps, sparse or concatenated
        Parameters
        ----------
            path: String
                Directory containing the files
            output: String
                'CONCAT.CSV'
                Output name (csv file). If '' no output is saved, only
                returns a pandas.DataFrame()
            index_name: String
                'TIME'
                Name for the index of the pandas.DataFrame()
            keep: boolean
                True
                Keeps the header in the output file
            ignore: list
                ['CONCAT.CSV', 'INFO.TXT']
                Ignores these files if present in the folder
        Returns
        -------
            Pandas dataframe
    '''

    concat = DataFrame()
    header_tokenized = dict()
    marked_for_revision = False

    for file in listdir(path):
        if file != output and file not in ignore:
            std_out(f'Loading file: {file}')
            filename, _ = splitext(file)
            src_path = join(path, file)

            try:
                with open(src_path, 'r', newline='\n',
                          errors='replace') as csv_file:
                    header = csv_file.readlines()[0:4]
            except:
                ignore_file = True
                std_out(f'Ignoring file: {file}', 'WARNING')
            else:
                ignore_file = False

            if ignore_file: continue

            if keep:
                short_tokenized = header[0].strip('\r\n').split(',')
                unit_tokenized = header[1].strip('\r\n').split(',')
                long_tokenized = header[2].strip('\r\n').split(',')
                id_tokenized = header[3].strip('\r\n').split(',')

                for item in short_tokenized:
                    if item != '' and item not in header_tokenized.keys():
                        index = short_tokenized.index(item)

                        header_tokenized[short_tokenized[index]] = dict()
                        header_tokenized[short_tokenized[index]][
                            'unit'] = unit_tokenized[index]
                        header_tokenized[short_tokenized[index]][
                            'long'] = long_tokenized[index]
                        header_tokenized[
                            short_tokenized[index]]['id'] = id_tokenized[index]

            temp = read_csv(src_path,
                            verbose=False,
                            skiprows=range(1, 4),
                            encoding_errors='ignore').set_index("TIME")
            temp = clean(temp, clean_na='drop', how='all')
            temp.index.rename(index_name, inplace=True)
            concat = concat.combine_first(temp)

    columns = concat.columns

    ## Sort index
    concat.sort_index(inplace=True)

    ## Save it as CSV
    if output.endswith('.CSV') or output.endswith('.csv'):
        concat.to_csv(join(path, output))

        if keep:
            print('Updating header')
            with open(join(path, output), 'r') as csv_file:
                content = csv_file.readlines()

                final_header = content[0].strip('\n').split(',')
                short_h = []
                units_h = []
                long_h = []
                id_h = []

                for item in final_header:
                    if item in header_tokenized.keys():
                        short_h.append(item)
                        units_h.append(header_tokenized[item]['unit'])
                        long_h.append(header_tokenized[item]['long'])
                        id_h.append(header_tokenized[item]['id'])

                content.pop(0)

                for index_content in range(len(content)):
                    content[index_content] = content[index_content].strip('\n')

                content.insert(0, ','.join(short_h))
                content.insert(1, ','.join(units_h))
                content.insert(2, ','.join(long_h))
                content.insert(3, ','.join(id_h))

            with open(join(path, output), 'w') as csv_file:
                print('Saving file to:', output)
                wr = csv.writer(csv_file, delimiter='\t')

                for row in content:
                    wr.writerow([row])

    return concat
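A hypothetical call, assuming a folder of SD card CSV files (the path is made up):

df = sdcard_concat('/path/to/sdcard', output='CONCAT.CSV', index_name='TIME')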
Example #8
    def get_device_data(self,
                        start_date=None,
                        end_date=None,
                        frequency='1H',
                        clean_na=None):
        '''
        Based on code snippet from Marc Roig:
        # I2CAT RESEARCH CENTER - BARCELONA - MARC ROIG ([email protected])
        '''

        std_out(f'Requesting data from Dades Obertes API')
        std_out(f'Device ID: {self.id}')
        self.get_device_sensors()
        self.get_device_location()

        request = self.API_BASE_URL
        request += f'codi_eoi={self.id}'

        if start_date is not None and end_date is not None:
            request += "&$where=data between " + to_datetime(start_date).strftime("'%Y-%m-%dT%H:%M:%S'") \
                    + " and " + to_datetime(end_date).strftime("'%Y-%m-%dT%H:%M:%S'")
        elif start_date is not None:
            request += "&$where=data >= " + to_datetime(start_date).strftime(
                "'%Y-%m-%dT%H:%M:%S'")
        elif end_date is not None:
            request += "&$where=data < " + to_datetime(end_date).strftime(
                "'%Y-%m-%dT%H:%M:%S'")

        try:
            s = get(request)
        except:
            print_exc()
            std_out('Problem with sensor data from API', 'ERROR')
            return None

        if s.status_code == 200 or s.status_code == 201:
            df = read_csv(StringIO(s.content.decode('utf-8')))
        else:
            std_out('API reported {}'.format(s.status_code), 'ERROR')
            return None

        # Filter columns
        measures = ['h0' + str(i) for i in range(1, 10)]
        measures += ['h' + str(i) for i in range(10, 25)]
        # validations = ['v0' + str(i) for i in range(1,10)]
        # validations  += ['v' + str(i) for i in range(10,25)]
        new_measures_names = list(range(1, 25))

        columns = ['contaminant', 'data'] + measures  # + validations
        try:
            df_subset = df[columns]
            df_subset.columns = ['contaminant', 'date'] + new_measures_names
        except:
            print_exc()
            std_out('Problem while filtering columns', 'ERROR')
            return None
        else:
            std_out('Successful filtering', 'SUCCESS')

        # Pivot
        try:
            df = DataFrame([])
            for contaminant in self.sensors.keys():
                if contaminant not in df_subset['contaminant'].values:
                    std_out(f'{contaminant} not in columns. Skipping',
                            'WARNING')
                    continue
                df_temp = df_subset.loc[
                    df_subset['contaminant'] == contaminant].drop(
                        'contaminant',
                        axis=1).set_index('date').unstack().reset_index()
                df_temp.columns = ['hours', 'date', contaminant]
                df_temp['date'] = to_datetime(df_temp['date'])
                timestamp_lambda = lambda x: x['date'] + DateOffset(hours=int(
                    x['hours']))
                df_temp['date'] = df_temp.apply(timestamp_lambda, axis=1)
                df_temp = df_temp.set_index('date')
                df[contaminant] = df_temp[contaminant]
        except:
            # print_exc()
            std_out('Problem while pivoting', 'ERROR')
            return None
        else:
            std_out('Successful pivoting', 'SUCCESS')

        df.index = to_datetime(df.index).tz_localize('UTC').tz_convert(
            self.location)
        df.sort_index(inplace=True)

        # Rename
        try:
            df.rename(columns=self.sensors, inplace=True)
        except:
            # print_exc()
            std_out('Problem while renaming columns', 'ERROR')
            return None
        else:
            std_out('Successful renaming', 'SUCCESS')

        # Clean
        df = df[~df.index.duplicated(keep='first')]
        # Drop unnecessary columns
        df.drop([i for i in df.columns if 'Unnamed' in i],
                axis=1,
                inplace=True)
        # Check for weird things in the data
        df = df.apply(to_numeric, errors='coerce')
        # Resample
        df = df.resample(frequency).mean()

        try:
            df = df.reindex(df.index.rename('Time'))

            df = clean(df, clean_na, how='all')
            # if clean_na is not None:
            #     if clean_na == 'drop':
            #         # std_out('Cleaning na with drop')
            #         df.dropna(axis = 0, how='all', inplace=True)
            #     elif clean_na == 'fill':
            #         df = df.fillna(method='bfill').fillna(method='ffill')
            #         # std_out('Cleaning na with fill')
            self.data = df

        except:
            std_out('Problem closing up the API dataframe', 'ERROR')
            return None

        std_out(f'Device {self.id} loaded successfully from API', 'SUCCESS')
        return self.data
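A standalone sketch (with made-up data) of the hourly-column pivot this parser performs: the API returns one row per contaminant and day with 24 hourly columns, which gets unstacked into a single timestamped series:

import pandas as pd

df_subset = pd.DataFrame({
    'contaminant': ['NO2', 'NO2'],
    'date': ['2021-06-01', '2021-06-02'],
    1: [20.0, 25.0],   # hour 1 reading
    2: [21.0, 26.0],   # hour 2 reading
})

# Wide (one column per hour) to long (one row per hour)
df_temp = (df_subset.loc[df_subset['contaminant'] == 'NO2']
           .drop('contaminant', axis=1)
           .set_index('date')
           .unstack()
           .reset_index())
df_temp.columns = ['hours', 'date', 'NO2']
df_temp['date'] = pd.to_datetime(df_temp['date'])
# Shift each reading to its hour of the day
df_temp['date'] = df_temp.apply(
    lambda x: x['date'] + pd.DateOffset(hours=int(x['hours'])), axis=1)
print(df_temp.set_index('date').sort_index())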
Example #9
    def get_device_data(self,
                        start_date=None,
                        end_date=None,
                        frequency='1Min',
                        clean_na=None):

        std_out(f'Requesting data from SC API')
        std_out(f'Device ID: {self.id}')

        rollup = self.convert_rollup(frequency)
        std_out(f'Using rollup: {rollup}')

        # Make sure we have the everything we need beforehand
        self.get_device_sensors()
        self.get_device_location()
        self.get_device_last_reading()
        self.get_device_added_at()
        self.get_kit_ID()

        if self.location is None: return None

        # Check start date
        # if start_date is None and self.added_at is not None:
        #     start_date = localise_date(to_datetime(self.added_at, format = '%Y-%m-%dT%H:%M:%SZ'), self.location)
        #     # to_datetime(self.added_at, format = '%Y-%m-%dT%H:%M:%SZ')
        # elif start_date is not None:
        #     start_date = to_datetime(start_date, format = '%Y-%m-%dT%H:%M:%SZ')
        if start_date is not None:
            start_date = localise_date(
                to_datetime(start_date, format='%Y-%m-%dT%H:%M:%SZ'),
                self.location)

            # if start_date.tzinfo is None: start_date = start_date.tz_localize('UTC').tz_convert(self.location)
            std_out(f'Min Date: {start_date}')

        # # Check end date
        # if end_date is None and self.last_reading_at is not None:
        #     # end_date = to_datetime(self.last_reading_at, format = '%Y-%m-%dT%H:%M:%SZ')
        #     end_date = localise_date(to_datetime(self.last_reading_at, format = '%Y-%m-%dT%H:%M:%SZ'), self.location)
        # elif end_date is not None:
        #     end_date = to_datetime(end_date, format = '%Y-%m-%dT%H:%M:%SZ')
        if end_date is not None:
            end_date = localise_date(
                to_datetime(end_date, format='%Y-%m-%dT%H:%M:%SZ'),
                self.location)

            # if end_date.tzinfo is None: end_date = end_date.tz_localize('UTC').tz_convert(self.location)

            std_out(f'Max Date: {end_date}')

        # if start_date > end_date: std_out('Ignoring device dates. Probably SD card device', 'WARNING')

        # Print stuff
        std_out('Kit ID: {}'.format(self.kit_id))
        # if start_date < end_date: std_out(f'Dates: from: {start_date}, to: {end_date}')
        std_out(f'Device timezone: {self.location}')
        if not self.sensors.keys():
            std_out(f'Device is empty')
            return None
        else:
            std_out(f'Sensor IDs: {list(self.sensors.keys())}')

        df = DataFrame()

        # Get devices in the sensor first
        for sensor_id in self.sensors.keys():

            # Request sensor per ID
            request = self.API_BASE_URL + '{}/readings?'.format(self.id)

            if start_date is None:
                request += 'from=2001-01-01'
            elif end_date is not None:
                if start_date > end_date: request += 'from=2001-01-01'
                else:
                    request += f'from={start_date}'
                    request += f'&to={end_date}'

            request += f'&rollup={rollup}'
            request += f'&sensor_id={sensor_id}'
            request += '&function=avg'
            # if end_date is not None:
            #     if end_date > start_date: request += f'&to={end_date}'

            # Make request
            sensor_req = get(request)
            flag_error = False
            try:
                sensorjson = sensor_req.json()
            except:
                print_exc()
                std_out('Problem with json data from API', 'ERROR')
                flag_error = True
                continue

            if 'readings' not in sensorjson.keys():
                std_out(f'No readings key in request for sensor: {sensor_id}',
                        'ERROR')
                flag_error = True
                continue

            elif sensorjson['readings'] == []:
                std_out(f'No data in request for sensor: {sensor_id}',
                        'WARNING')
                flag_error = True
                continue

            if flag_error: continue

            # Put
            try:
                dfsensor = DataFrame(sensorjson['readings']).set_index(0)
                dfsensor.columns = [self.sensors[sensor_id]]
                # dfsensor.index = to_datetime(dfsensor.index).tz_localize('UTC').tz_convert(self.location)
                dfsensor.index = localise_date(dfsensor.index, self.location)
                dfsensor.sort_index(inplace=True)
                dfsensor = dfsensor[~dfsensor.index.duplicated(keep='first')]

                # Drop unnecessary columns
                dfsensor.drop([i for i in dfsensor.columns if 'Unnamed' in i],
                              axis=1,
                              inplace=True)
                # Check for weird things in the data
                dfsensor = dfsensor.apply(to_numeric, errors='coerce')
                # Resample
                dfsensor = dfsensor.resample(frequency).mean()

                df = df.combine_first(dfsensor)
            except:
                print_exc()
                std_out('Problem with sensor data from API', 'ERROR')
                flag_error = True
                continue

            try:
                df = df.reindex(df.index.rename('Time'))

                df = clean(df, clean_na, how='all')
                # if clean_na is not None:
                #     if clean_na == 'drop':
                #         # std_out('Cleaning na with drop')
                #         df.dropna(axis = 0, how='all', inplace=True)
                #     elif clean_na == 'fill':
                #         df = df.fillna(method='bfill').fillna(method='ffill')
                #         # std_out('Cleaning na with fill')
                self.data = df

            except:
                std_out('Problem closing up the API dataframe', 'ERROR')
                return None

        if not flag_error:
            std_out(f'Device {self.id} loaded successfully from API',
                    'SUCCESS')
        return self.data
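For reference, a standalone sketch of the readings URL assembled per sensor, assuming the same SmartCitizen base URL as the POST endpoint in Example #1 (the device id, sensor id, rollup and dates are made up):

API_BASE_URL = 'https://api.smartcitizen.me/v0/devices/'
device_id, sensor_id, rollup = 14638, 53, '1m'

request = API_BASE_URL + '{}/readings?'.format(device_id)
request += 'from=2021-01-01'
request += '&to=2021-02-01'
request += f'&rollup={rollup}'
request += f'&sensor_id={sensor_id}'
request += '&function=avg'
print(request)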
Example #10
def read_csv_file(file_path, location, frequency, clean_na = None, index_name = '', skiprows = None, sep = ',', encoding = 'utf-8'):
    """
    Reads a csv file, then cleans, localises and resamples it into a pandas dataframe
    Parameters
    ----------
        file_path: String
            File path for csv file
        location: String
            Time zone for the csv file
        clean_na: String or None
            None
            Whether to perform clean_na or not. Either None, 'fill' or 'drop'
        index_name: String
            ''
            Name of the column to set an index in the dataframe
        skiprows: list or None
            None
            List of rows to skip (same as skiprows in pandas.read_csv)
        sep: String
            ','
            Separator (same as sep in pandas.read_csv)
        encoding: String
            'utf-8'
            Encoding of the csv file
    Returns
    -------
        Pandas dataframe
    """  

    # Read pandas dataframe
    df = read_csv(file_path, verbose = False, skiprows = skiprows, sep = sep, encoding = encoding)

    flag_found = False
    for column in df.columns:
        if index_name in column: 
            df = df.set_index(column)
            flag_found = True
            break

    if not flag_found:
        std_out('Index not found. Cannot reindex', 'ERROR')
        return None

    # Set index
    df.index = localise_date(df.index, location)
    # Remove duplicates
    df = df[~df.index.duplicated(keep='first')]
    
    # Sort index
    df.sort_index(inplace=True)
    
    # Drop unnecessary columns
    df.drop([i for i in df.columns if 'Unnamed' in i], axis=1, inplace=True)
    
    # Check for weird things in the data
    df = df.apply(to_numeric, errors='coerce')   
    
    # Resample
    df = df.resample(frequency).mean()

    # Remove na
    df = clean(df, clean_na, how = 'all')
    
    return df    
Example #11
def path_plot(self,
              channel=None,
              map_type='dynamic',
              devices='all',
              start_date=None,
              end_date=None,
              options=dict()):
    '''
    Creates a folium map showing a path
    Parameters
    ----------
    channel: String
        None
        If None, shows the path only; otherwise, a colored path mapped to the channel values
    map_type: String
        'dynamic'
        'dynamic' or 'static'. Whether it is a dynamic map or not
    devices: list or 'all'
        List of devices to include, or 'all' from self.devices
    start_date, end_date: String
        Date convertible string
    options: dict
        Possible keys are (default otherwise)
            location: list
                [41.400818, 2.1825157]
                Center map location
            tiles: String
                'Stamen Toner'
                Tiles for the folium.Map
            zoom: float
                2.5
                Zoom to start with in folium.Map
            period: String
                '1W'
                Period for 'dynamic' map
            radius: float
                10
                Circle radius for icon
            fillOpacity: float
                1
                (<1) Fill opacity for the icon
            stroke: String
                'false'
                'true' or 'false'. For icon's stroke
            icon: String
                'circle'
                A valid folium.Map icon style
    Returns
    -------
        Folium.Map object
    '''

    # Set defaults
    options = dict_fmerge(config._map_def_opt, options)

    # Make features
    features = []
    if devices == 'all':
        mdev = self.devices
    else:
        mdev = list()
        for device in devices:
            if device in self.devices: mdev.append(device)
            else: std_out(f'Device {device} not found, ignoring', 'WARNING')

    if len(mdev) == 0:
        std_out('Requested devices not in test', 'ERROR')
        return None

    for device in mdev:
        chs = ['GPS_LAT', 'GPS_LONG']
        if channel is not None:
            if channel not in self.devices[str(device)].readings.columns:
                std_out(
                    f'Channel {channel} not in columns: {self.devices[str(device)].readings.columns}',
                    'ERROR')
                return None

            # Get bins
            minmax = False
            if not options['minmax']:
                if all([key not in channel for key in config._channel_bins]):
                    std_out(
                        f'Requested channel {channel} not in config mapped bins {config._channel_bins.keys()}.Using min/max mapping',
                        'WARNING')
                    minmax = True
            else:
                minmax = True

            if minmax:
                bins = linspace(
                    self.devices[str(device)].readings[channel].min(),
                    self.devices[str(device)].readings[channel].max(),
                    config._channel_bin_n)
            else:
                for bname in config._channel_bins.keys():
                    if bname in channel:
                        bins = config._channel_bins[bname]
                        break
            chs.append(channel)

        # Create copy
        dfc = self.devices[str(device)].readings[chs].copy()
        # Resample and cleanup
        # TODO THIS CAN INPUT SOME MADE UP READINGS
        dfc = clean(dfc.resample(options['period']).mean(), 'fill')

        # Make color column
        legend_labels = None
        if channel is not None:
            dfc['COLOR'] = cut(dfc[channel], bins, labels =\
                config._map_colors_palette)

            # Make legend labels
            legend_labels = {}
            for ibin in range(len(bins) - 1):
                legend_labels[f'{round(bins[ibin],2)} : {round(bins[ibin+1],2)}'] =\
                    config._map_colors_palette[ibin]
        else:
            dfc['COLOR'] = config._map_colors_palette[0]

        if start_date is not None:
            dfc = dfc[dfc.index > start_date]
        if end_date is not None:
            dfc = dfc[dfc.index < end_date]

        # Add point for each date
        for date in dfc.index:
            if date == dfc.index[-1]: break
            times = []

            color = str(dfc.loc[date, 'COLOR'])
            if color == 'nan' or isnan(dfc.loc[date, 'GPS_LONG'])\
            or isnan(dfc.loc[date, 'GPS_LAT']):
                std_out(f'Skipping point {date}', 'WARNING')
                continue

            geometry = {
                'type':
                'LineString',
                'coordinates':
                [[dfc.loc[date, 'GPS_LONG'], dfc.loc[date, 'GPS_LAT']],
                 [
                     dfc.loc[date + dfc.index.freq, 'GPS_LONG'],
                     dfc.loc[date + dfc.index.freq, 'GPS_LAT']
                 ]],
            }

            properties = {
                'icon':
                options['icon'],
                'iconstyle': {
                    'fillColor': color,
                    'fillOpacity': options['fillOpacity'],
                    'stroke': options['stroke'],
                    'radius': options['radius']
                },
                'device':
                device,
                'timestamp':
                date.strftime('%Y-%m-%dT%H:%M:%S'),
                "coordinates": [
                    dfc.loc[date + dfc.index.freq, 'GPS_LAT'],
                    dfc.loc[date + dfc.index.freq, 'GPS_LONG']
                ],
                'style': {
                    'color': color,
                    'stroke-width': options['stroke-width'],
                    'fillOpacity': options['fillOpacity']
                }
            }

            # Add reading to tooltip
            if channel is not None:
                properties['channel'] = channel
                properties['value'] = dfc.loc[date, channel]

            if map_type == 'dynamic':
                properties['times'] = [
                    date.strftime('%Y-%m-%dT%H:%M:%S'),
                    (date + dfc.index.freq).strftime('%Y-%m-%dT%H:%M:%S')
                ]

            features.append({
                'type': 'Feature',
                'geometry': geometry,
                'properties': properties
            })

    featurecol = {'type': 'FeatureCollection', 'features': features}

    # Make map
    if options['location'] == 'average':
        avg_long = dfc['GPS_LONG'].mean()
        avg_lat = dfc['GPS_LAT'].mean()
        loc = [avg_lat, avg_long]
    else:
        loc = options['location']

    m = Map(
        location=loc,
        tiles=options['tiles'],
        zoom_start=options['zoom'],
    )

    if map_type == 'static':
        # TODO WORKAROUND UNTIL GEOJSON ACCEPTS MARKERS
        if options['markers']:
            for feature in features:
                Circle(location=[
                    feature['geometry']['coordinates'][0][1],
                    feature['geometry']['coordinates'][0][0]
                ],
                       fill='true',
                       radius=feature['properties']['iconstyle']['radius'],
                       color=feature['properties']['iconstyle']['fillColor'],
                       fill_opacity=feature['properties']['iconstyle']
                       ['fillOpacity']).add_to(m)

        if channel is not None:
            fields = ["device", "channel", "timestamp", "coordinates", "value"]
            aliases = [
                "Device:", "Sensor:", "Timestamp:", "Coordinates:", "Reading:"
            ]
        else:
            fields = ["device", "timestamp", "coordinates"]
            aliases = ["Device:", "Timestamp:", "Coordinates:"]

        popup = GeoJsonPopup(
            fields=fields,
            aliases=aliases,
            localize=True,
            labels=True,
            max_width=800,
        )

        tooltip = GeoJsonTooltip(
            fields=fields,
            aliases=aliases,
            localize=True,
            sticky=True,
            labels=True,
            style="""
                background-color: #F0EFEF;
                border: 1px solid gray;
                border-radius: 1px;
                box-shadow: 2px;
            """,
            max_width=800,
        )

        GeoJson(
            featurecol,
            tooltip=tooltip,
            popup=popup,
            style_function=lambda x: {
                'color': x['properties']['style']['color'],
                'weight': x['properties']['style']['stroke-width'],
                'fillOpacity': x['properties']['style']['fillOpacity']
            },
        ).add_to(m)

    elif map_type == 'dynamic':
        TimestampedGeoJson(featurecol,
                           period='PT' + convert_rollup(options['period']),
                           add_last_point=True,
                           auto_play=False,
                           loop=False,
                           max_speed=options['max_speed'],
                           loop_button=True,
                           time_slider_drag_update=True).add_to(m)

    else:
        std_out(f'Not supported map type {map_type}', 'ERROR')
        return None

    if options['minimap']:
        minimap = MiniMap(toggle_display=True, tile_layer=options['tiles'])
        minimap.add_to(m)

    if options['legend'] and legend_labels is not None:

        templateLoader = FileSystemLoader(searchpath=join(dirname(__file__),\
            'templates'))
        templateEnv = Environment(loader=templateLoader)
        template = templateEnv.get_template("map_legend.html")

        filled_map_legend = template.render(legend_labels=legend_labels)

        map_legend_html = '{% macro html(this, kwargs) %}'+\
            filled_map_legend+\
            '{% endmacro %}'

        legend = element.MacroElement()
        legend._template = element.Template(map_legend_html)

        m.get_root().add_child(legend)

    return m
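A hypothetical call, assuming a test object `test` whose devices carry GPS_LAT/GPS_LONG readings (the device id and channel are made up):

m = test.path_plot(channel='PM_25',
                   devices=['sck_001'],
                   map_type='static',
                   options={'period': '5Min', 'markers': True})
m  # renders the folium map in a notebook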