Example #1
class OGC_SOS:
    def __init__(self,
                 request,
                 procedure=None,
                 offering=None,
                 eventTime=None,
                 observedProperty=None,
                 page=None):
        self.available_requests = [
            'GetCapabilities', 'DescribeSensor', 'GetObservation'
        ]
        self.offering = offering
        self.eventTime = eventTime
        self.observedProperty = observedProperty
        self.info = OGC_SOS_CONFIGURATION
        self.keywords = list()
        self.stations = list()
        self.results = list()
        self.metadata = list()
        self.procedure = procedure
        self.sensor = None
        self.page = page
        self.template = None

        self.exception = False
        self.helper_object = None

        self.exceptionDetails = {}

        self.data = DatabaseHandler()

        if request in self.available_requests:
            self.request = request
            self.determine_request()

    def determine_request(self):
        if self.request == "GetCapabilities":
            self.find_keywords()
            self.find_stations()
            self.find_metadata()
            self.template = "sos/GetCapabilities.xml"
        elif self.request == "DescribeSensor":
            # self.procedure is like: station_name:sensor_name:template_id

            try:
                station_id, sensor_id, template_id = self.procedure.split(':')
                exists = self.data.get_helper_for_describe_sensor(
                    station_id=station_id,
                    sensor_id=sensor_id,
                    template_id=template_id)
                if exists:
                    self.sensor = exists
                    self.template = "sos/DescribeSensor.xml"
                else:
                    self.template = "sos/DescribeSensorException.xml"
            except Exception:
                self.template = "sos/DescribeSensorException.xml"
        elif self.request == "GetObservation":
            try:
                station_id, sensor_id, template_id = self.procedure.split(':')
                exists = self.data.get_helper_for_describe_sensor(
                    station_id=station_id,
                    sensor_id=sensor_id,
                    template_id=template_id)
                if exists:
                    # from_time, to_time = self.eventTime.split('/')
                    # from_time = pd.to_datetime(from_time)
                    # to_time = pd.to_datetime(to_time)

                    self.helper_object = exists
                    results = self.data.get_observations_by_helper_id(
                        self.helper_object.id)
                    for row in results:
                        self.results.append(
                            Measurement(
                                value=row.value,
                                timestamp=row.timestamp,
                                observable=self.helper_object.observable,
                                uom=self.helper_object.uom,
                                station=self.helper_object.station,
                                helper=self.helper_object))
                    # self.results = [Measurement(value=row.value, timestamp=row.timestamp) for row in results]

                    self.template = "sos/GetObservation.xml"
                else:
                    self.template = "sos/GetObservationException.xml"
            except Exception as inst:
                print(inst)
                self.template = "sos/GetObservationException.xml"

    def find_keywords(self):
        self.keywords = [
            quantity.name for quantity in self.data.get_all_observables()
        ]

    def find_stations(self):
        self.stations = list(self.data.get_all_stations())

    def find_metadata(self):
        self.metadata = list(self.data.get_all_helper_observable_ids())
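
A minimal, standalone sketch of the request and procedure conventions the class relies on; the request names come from available_requests above, while the station, sensor and template identifiers are made up for illustration:

available_requests = ['GetCapabilities', 'DescribeSensor', 'GetObservation']

request = 'DescribeSensor'
procedure = 'melbourne_airport:temp_sensor:3'  # station_name:sensor_name:template_id

if request in available_requests:
    # determine_request() splits the procedure the same way
    station_id, sensor_id, template_id = procedure.split(':')
    print(request, '->', station_id, sensor_id, template_id)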
Example #2
def parse_for_iterations(input_iteration_file, template_iteration_file, iterable_type='Station'):
    """
    I presume that files are formatted as csv.
    The strategy here is as follows:
    1. Clear anything that is not enclosed in {{}}. E.g. ":".
    TODO: Do the same for actual observations
    2. Create a dataframe from the csv file
    3. Change dataframe column names to the information derived from template. E.g. 1st column is station.station_id
    (strip "station.". !Attention .split('.')[-1])
    4. Drop any columns we don't want to really parse
    
    :param input_iteration_file:
    :param template_iteration_file:
    :param iterable_type: 'Station', 'Observables', 'Unit of Measurements'
    :return:
    """
    var_for_line = re.compile(r"{%for .*? in .*?%}\n(.*)\n{%endfor%}")
    var_name = re.compile(r"({{.*?}})")
    
    temp_iteration_file = template_iteration_file
    temp_iteration_file.seek(0)
    text = temp_iteration_file.read()
    
    # for_lines = "#{{station.station_id}}:,{{station.longtitude}},{{station.latitude}}"
    for_lines = re.findall(var_for_line, text)[0]
    # variables = ['{{station.station_id}}', '{{station.longtitude}}', '{{station.latitude}}']
    variables = [var for var in re.findall(var_name, for_lines)]
    
    characters_to_be_replaced = for_lines
    for variable in variables:
        characters_to_be_replaced = characters_to_be_replaced.replace(variable, '')
    # characters_to_be_replaced = "#:,,,,"
    # Thus we should remove comma character
    characters_to_be_replaced = characters_to_be_replaced.replace(',', '')
    for character_to_be_replaced in characters_to_be_replaced:
        for_lines = for_lines.replace(character_to_be_replaced, '')
    for_lines = for_lines.split(',')
    template_for_lines_indexed = dict(enumerate(for_lines))
    
    # Determine which indexes hold variables
    # Create dataframe header
    dataframe_header = []
    for counter_index in range(len(template_for_lines_indexed)):
        stripped_header = template_for_lines_indexed[counter_index].strip("{{}}")  # type: str
        stripped_header = stripped_header.split('.')
        # Remove the leading iterable name (e.g. "station")
        iterative_type = stripped_header[0]
        del stripped_header[0]
        # Re-join the rest; this is needed for tags.something
        stripped_header = '.'.join(stripped_header)
        dataframe_header.append(stripped_header)
    
    df = pd.read_csv(input_iteration_file, na_values='', header=0, names=dataframe_header)
    
    for column in df.columns:
        # First strip template residue from the values
        try:
            df[column] = df[column].str.strip(to_strip=characters_to_be_replaced)
        except AttributeError:
            # Non-string columns have no .str accessor
            pass
        if column.startswith('tags'):
            key = str(column.split('.')[-1])

            # Update column to a "key":"value" JSON fragment
            df[column] = "\"" + key + "\"" + ':' + "\"" + df[column].map(str) + "\""
            
            if 'tags' in df.columns:
                df['tags'] = df['tags'] + ',' + df[column]
            else:
                df['tags'] = '{' + df[column].map(str)
            # Drop this column
            df.drop(column, axis=1, inplace=True)
    # A KeyError will occur for observables for which we don't have tags
    try:
        df['tags'] = df['tags'] + '}'
        # df['tags'] = df['tags'].apply(lambda x: json.loads(x))
    except KeyError:
        pass
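    # Illustration (made-up values): for a row with tags.owner = "BoM" and
    # tags.elevation = "132", the loop above builds the single string
    # {"owner":"BoM","elevation":"132"} in the 'tags' column.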
    
    # Station data (metadata) are stored directly in the database, while observables are sent back to
    # SourceConfiguration for further processing

    # Just check whether the 'observable' keyword was used in the first column
    if iterative_type == 'observable':
        temp_iteration_file.seek(0)
        return df.to_dict(orient='index')
    
    else:
        data = DatabaseHandler()
        # Remove tags json field. It creates problems with pd.merge
        
        dup_cols = list(df)
        try:
            dup_cols.remove('tags')
        except ValueError:
            pass
        if database_type == "postgres":
            tablename = 'public."' + iterable_type + '"'
        elif database_type == "sqlite":
            tablename = '"' + iterable_type + '"'
        df_to_store = data.clean_df_db_dups(df=df, tablename=tablename, dup_cols=dup_cols)  # type: pd.DataFrame
        
        data.__add_dataframe__(dataframe=df_to_store, table=iterable_type, index=False, index_label=None)
        temp_iteration_file.seek(0)
        return [int(x) for x in df_to_store.index.tolist()]
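
For reference, a runnable sketch of the template scanning performed above, applying the same two regular expressions to a made-up iteration template:

import io
import re

var_for_line = re.compile(r"{%for .*? in .*?%}\n(.*)\n{%endfor%}")
var_name = re.compile(r"({{.*?}})")

template = io.StringIO("{%for station in stations%}\n"
                       "#{{station.station_id}}:,{{station.longitude}},{{station.latitude}}\n"
                       "{%endfor%}")
text = template.read()

# The line inside the for-loop, then the placeholders it contains
for_lines = re.findall(var_for_line, text)[0]
variables = re.findall(var_name, for_lines)
print(variables)
# ['{{station.station_id}}', '{{station.longitude}}', '{{station.latitude}}']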
Example #3
class SourceConfiguration:
    """
    This class handles configuration files, drafted by users.
    It reads it and extracts all relevant information.
    1. Creates the Station and Sensors objects. It checks if Station exists and appends related Sensors in it.
    2. Sets file locators for all corresponding input and output templates
    3. Sets locators for all corresponding inputs. Depending on the type (html, file, sql), calls the appropriate
    connector of the Connector class.
    
    A SourceConfiguration object serves as an input to TemplateReader.
    """
    
    def __init__(self, input_yaml, input_file_data=io.StringIO(), input_preamble=io.StringIO(),
                 template_preamble=io.StringIO()):
        """
        """
        # TODO: Implement this one.
        self.sensor_id = None
        
        self.database = DatabaseHandler()
        
        self.input_yaml = input_yaml
        self.input_file = input_file_data
        self.input_preamble = input_preamble
        self.template_preamble = template_preamble
        
        self.input_yaml.seek(0)
        self.input_file.seek(0)
        self.input_preamble.seek(0)
        self.template_preamble.seek(0)
        
        self.helper_template = pd.DataFrame(columns=['observable_id',
                                                     'abstract_observable_id',
                                                     'unit_id',
                                                     'station_id',
                                                     'sensor_id'])
        # self.available_fields = ['Station', 'Observables', 'Units of Measurement', 'Sensors', 'Data inputs']
        self.station_id = []
        self.content = None
        self.all_observable_ids = list()
        
        self.handler()
    
    def check_yaml(self):
        try:
            return yaml.load(self.input_yaml, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            return exc
    
    def handler(self):
        # Firstly we open the yaml
        self.content = self.check_yaml()
        
        # Check Station type
        station_type, source, template = self.__check_type_of_field(field_name='Station')
        
        if station_type == 'iterable':
            self.station_id = parse_for_iterations(input_iteration_file=source,
                                                   template_iteration_file=template, iterable_type='Station')
        else:
            self.set_station(parse_from_yaml=station_type)
        # Do the same for all other fields
        # Though this could be done only once.
        observable_type, source, template = self.__check_type_of_field(field_name='Observables')
        if observable_type == 'iterable':
            self.set_observables(iter_or_not=parse_for_iterations(input_iteration_file=source,
                                                                  template_iteration_file=template,
                                                                  iterable_type='Observables'))
        else:
            self.set_observables()
        
        # We should take care of cases in which the uom and sensor fields are empty.
        # If these fields are empty, they should default to "unknown" values
        self.set_units_of_measurement()
        self.set_sensors()
        
        # Update dataframe. For some reason int is transformed to float, so here I revert this (for the affected columns)
        self.helper_template['station_id'] = self.helper_template['station_id'].apply(int)
        self.helper_template['sensor_id'] = self.helper_template['sensor_id'].apply(int)
        self.helper_template['abstract_observable_id'] = self.helper_template['abstract_observable_id'].apply(int)
        self.helper_template['unit_id'] = self.helper_template['unit_id'].apply(int)
        
        # Now copy this dataframe as many times as len(self.station_id)
        # self.helper_template = self.helper_template.append(temp, ignore_index=True)
        # We skip the first station_id since that id is already incorporated.
        temp = self.helper_template.copy(deep=True)
        for station_id in self.station_id[1:]:
            temp['station_id'] = station_id
            self.helper_template = self.helper_template.append(temp, ignore_index=True)
        
        del temp
        # We have to check if we have duplicates!
        df_cleaned = self.database.clean_df_db_dups(df=self.helper_template, tablename='HelperTemplateIDs',
                                                    dup_cols=list(self.helper_template))
        
        self.database.__add_dataframe__(dataframe=df_cleaned, table='HelperTemplateIDs', index=False)
    
    def set_station(self, parse_from_yaml):
        """
        :return:
        """
        # With the following command I deserialize a Station object from the .yaml file
        if parse_from_yaml is not None:
            station = Station.fromdictionary(self.content['Station'])  # type: Station
        else:
            station = Station()
        
        # It means metadata have to be parsed from the preambles
        if self.input_preamble.seek(0, os.SEEK_END) > 0 and self.template_preamble.seek(0, os.SEEK_END) > 0:
            self.input_preamble.seek(0)
            self.template_preamble.seek(0)
            station = extract_data_from_preamble(station,
                                                 preamble_template=self.template_preamble,
                                                 preamble_input=self.input_preamble)
        
        # With the following command, I determine the existence of a station object
        # with the same attributes (non-duplicate entries)
        
        exists, station_from_db = self.database.__check_station_is_in_db__(station)
        if exists:
            station_id = station_from_db.id
        else:
            station.latitude = safe_float(station.latitude)
            station.longitude = safe_float(station.longitude)
            if station.latitude is None and station.longitude is None and station.name is not None:
                geolocator = GoogleV3()
                try:
                    location = geolocator.geocode(station.name + station.region)
                except Exception:
                    try:
                        location = geolocator.geocode(station.name)
                    except Exception:
                        location = None
                if location is not None:
                    station.latitude = location.latitude
                    station.longitude = location.longitude
            
            _, station_id = self.database.__add_item__(station)
        
        self.station_id.append(station_id)
    
    def set_observables(self, iter_or_not=None):
        
        if iter_or_not is None:
            observables = self.content['Observables']
        else:
            observables = iter_or_not
            
            # This is where we should parse metadata of observables
            # parse_observables_with_reasoner(observables=observables)
        for obs in observables:
            observable_as_dict = obs  # type: dict
            # Deprecated
            # observable_as_dict['station_id'] = self.station_id
            observable = AbstractObservables.fromdictionary(observable_as_dict)
            exists, respective_abstract_observable_id = self.database.__check_observable_is_in_db__(observable)
            if exists:
                respective_abstract_observable_id = respective_abstract_observable_id[0]
            else:
                _, respective_abstract_observable_id = self.database.__add_item__(observable)
            
            # Create the dataframe for the first station. The others will be exactly the same
            # apart from the station_id column. This derives from cedar requirements, i.e. observables, uoms, sensors, etc.
            # located in a config file regard ALL THE STATIONS in the config.
            
            temp = pd.Series({'observable_id': observable_as_dict['observable_id'],
                              'abstract_observable_id': respective_abstract_observable_id,
                              'unit_id': None,
                              'station_id': self.station_id[0],
                              'sensor_id': self.sensor_id
                              })
            
            self.all_observable_ids.append(observable_as_dict['observable_id'])
            
            self.helper_template = self.helper_template.append(temp, ignore_index=True)
    
    def set_helper_observable_ids(self, helper_template_as_dictionary):
        helperTemplateID = HelperTemplateIDs.fromdictionary(helper_template_as_dictionary)
        exists, _ = self.database.__chech_helperTemplateID_is_in_db__(helperTemplateID)
        if not exists:
            _, _ = self.database.__add_item__(helperTemplateID)
    
    def set_units_of_measurement(self):
        
        if self.content['Units of Measurement'] is None:
            default_empty_uom = dict()
            default_empty_uom['name'] = "unknown"
            relevant_observables = self.all_observable_ids
            
            unit = UnitsOfMeasurement.fromdictionary(default_empty_uom)
            
            exists, unit_id = self.database.__check_unit_is_in_db__(unit)
            if exists:
                unit_id = unit_id[0]
            else:
                _, unit_id = self.database.__add_item__(unit)
            
            for observable_observable_id in relevant_observables:
                self.helper_template.loc[
                    self.helper_template['observable_id'] == observable_observable_id, 'unit_id'] = unit_id
        
        else:
            for uom in self.content['Units of Measurement']:
                uom_as_dict = uom  # type: dict
                if uom_as_dict['relevant_observables'] == '':
                    relevant_observables = self.all_observable_ids
                else:
                    relevant_observables = uom_as_dict['relevant_observables'].split(',')  # type: list
                    # remove spaces
                    relevant_observables = map(str.strip, relevant_observables)
                # No need to keep this key any more
                del uom_as_dict['relevant_observables']
                unit = UnitsOfMeasurement.fromdictionary(uom_as_dict)
                
                exists, unit_id = self.database.__check_unit_is_in_db__(unit)
                if exists:
                    unit_id = unit_id[0]
                else:
                    _, unit_id = self.database.__add_item__(unit)
                
                for observable_observable_id in relevant_observables:
                    self.helper_template.loc[
                        self.helper_template['observable_id'] == observable_observable_id, 'unit_id'] = unit_id
    
    def set_sensors(self):
        """
        :return:
        """
        if self.content['Sensors'] is None:
            default_empty_sensor = dict()
            default_empty_sensor['generic'] = True
            
            relevant_observables = self.all_observable_ids
            for observable_observable_id in relevant_observables:
                sensor = Sensors.fromdictionary(default_empty_sensor)
                # abstract_observable_id = None or id
                abstract_observable_id = \
                    self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                             (self.helper_template['observable_id'] == observable_observable_id)][
                        'abstract_observable_id'].values[0]
                abstract_observable_id = int(abstract_observable_id)
                
                unit_id = \
                    self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                             (self.helper_template['observable_id'] == observable_observable_id)][
                        'unit_id'].values[0]
                unit_id = int(unit_id)
                
                default_empty_sensor['unit_id'] = unit_id
                default_empty_sensor['abstract_observable_id'] = abstract_observable_id
                sensor.update(default_empty_sensor)
                
                exists, sensor_id = self.database.__check_sensor_is_in_db__(sensor)
                
                if exists:
                    sensor_id = sensor_id[0]
                else:
                    _, sensor_id = self.database.__add_item__(sensor)
                
                # We now need to update helper template ids table
                self.helper_template.loc[
                    (self.helper_template['station_id'] == self.station_id[0]) &
                    (self.helper_template['observable_id'] == observable_observable_id), 'sensor_id'] = sensor_id
        
        else:
            
            for sensor in self.content['Sensors']:
                sensor_as_dict = sensor  # type: dict
                if sensor_as_dict['relevant_observables'] == '':
                    relevant_observables = self.all_observable_ids
                else:
                    relevant_observables = sensor_as_dict['relevant_observables'].split(',')  # type: list
                    relevant_observables = map(str.strip, relevant_observables)
                # No need to keep this key any more
                del sensor_as_dict['relevant_observables']
                # We have to retrieve abstract_observable_id from observable_id
                # We are going to do this, through the HelperTemplateIDs
                # After retrieving this id, we will update 'sensor' object, check if it's in db already
                # And finally store it.
                
                # A generic sensor can have more than one relevant observable.
                # In that case, we are going to create as many generic sensor objects as there are relevant observables.
                
                # We are going to determine the abstract_observable_id through the observable_id
                
                for observable_observable_id in relevant_observables:
                    sensor = Sensors.fromdictionary(sensor_as_dict)
                    # abstract_observable_id = None or id
                    abstract_observable_id = \
                        self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                                 (self.helper_template['observable_id'] == observable_observable_id)][
                            'abstract_observable_id'].values[0]
                    abstract_observable_id = int(abstract_observable_id)
                    
                    unit_id = \
                        self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                                 (self.helper_template['observable_id'] == observable_observable_id)][
                            'unit_id'].values[0]
                    unit_id = int(unit_id)
                    
                    sensor_as_dict['unit_id'] = unit_id
                    sensor_as_dict['abstract_observable_id'] = abstract_observable_id
                    sensor.update(sensor_as_dict)
                    sensor.generic = True
                    exists, sensor_id = self.database.__check_sensor_is_in_db__(sensor)
                    # The next line resolves a bug introduced by sqlite:
                    # for some unknown reason a str type was interpreted as a dict when it came to storing
                    sensor.tags = str(sensor.tags)
                    if exists:
                        sensor_id = sensor_id[0]
                    else:
                        _, sensor_id = self.database.__add_item__(sensor)
                    
                    # We now need to update helper template ids table
                    self.helper_template.loc[
                        (self.helper_template['station_id'] == self.station_id[0]) &
                        (self.helper_template['observable_id'] == observable_observable_id), 'sensor_id'] = sensor_id
    
    def __check_type_of_field(self, field_name):
        """
        This function checks each given field values and determines the "type" of values.
        These could be in the form of "source:..., template:...", which means we need to
        iterate through source file and extract the fields. The other type is where fields
        are manually iterated in the config file, in the form of 1..., 2..., 3...
        :return: source_type (iterable, non_iterable), source (source_path, None), template (template_path, None)
        """
        # If both keys are present, it means we have to extract data from an iterable.
        try:
            if 'source' in self.content[field_name] and 'template' in self.content[field_name]:
                source_type = 'iterable'
                source = self.content[field_name]['source']
                template = self.content[field_name]['template']
                sexists, _, sourcef, source_io_object = check_if_path_exists(source)
                texists, _, templatef, template_io_object = check_if_path_exists(template)
                
                if not (sexists and texists):
                    # TODO logging!
                    raise SystemExit("%s and %s do not exist" % (source, template))
            
            else:
                source_type = 'non_iterable'
                source_io_object = io.StringIO()
                template_io_object = io.StringIO()
            
            return source_type, source_io_object, template_io_object
        except KeyError:
            # It means that the yaml does not contain this field (e.g. Station, or Observables).
            # That is because metadata are PROBABLY stored in the input files (preamble)
            return None, None, None
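
To illustrate what __check_type_of_field distinguishes, here is a small sketch with a made-up config; the file names and observable fields are assumptions, only the source/template convention comes from the code above:

import yaml

config_text = """
Station:
  source: stations.csv
  template: stations.tpl
Observables:
- observable_id: temp
  name: Temperature
"""
content = yaml.safe_load(config_text)

for field in ('Station', 'Observables'):
    # 'source' plus 'template' keys mark an iterable field
    if 'source' in content[field] and 'template' in content[field]:
        print(field, '-> iterable')
    else:
        print(field, '-> non_iterable')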
Example #4
class TemplateReader:
    """
    This class takes 2 inputs:
    1. Source configuration object
    2. Connector object

    From those it infers:
    1. template
    2. input_file

    and exports:
    3. A list with observation objects, ready to be stored in the database.
    """
    def __init__(self,
                 config=None,
                 input_file=io.StringIO(),
                 template=io.StringIO()):
        self.template_logger = logging.getLogger(
            'edam.reader.TemplateReader.TemplateReader')
        self.input_file = input_file
        self.template = template

        self.config = config

        self.Data = DatabaseHandler()

        self.df = None
        self.same_timestamp_arguments = None

        # I will create tuples (station, respective_dataframe, for_lines_indexed)
        # and append them to a list (parsing_tuples).
        # This process will be the first step. Parsing/storing and further processing follow.
        # station: The database id of the station for which data will be parsed
        # IMPORTANT: In case the input data is row-based and not column-based
        # (e.g. Australian data, see also git issue 6), we will generate a dataframe which will contain as many columns
        # as the "unique" observables of the station.

        # respective_dataframe: A dataframe which will have the timestamp column as index; its related observables
        # will be located in the other df columns. Parsing/storing of such a df is already implemented.

        # for_lines_indexed: each dataframe should have its own for_lines_indexed dictionary. Consider the example
        # of Australian data. We have a number of stations, each of which CAN POTENTIALLY have different observables.

        self.parsing_tuples = list()

        self.__open_template__()
        self.__set_dataframe_index_col()

        self.__create_dataframe_from_csv__()
        self.template_logger.info("I created the df from the csv")

        self.template_logger.info("I am starting handling stations")

        self.__handle_station_column__()
        self.template_logger.info("I am parsing data now")

        for station_id, station_respective_df, for_lines_indexed in self.parsing_tuples:
            self.template_logger.info("I am parsing station with %s id" %
                                      station_id)
            rows, columns = station_respective_df.shape
            self.template_logger.info("Rows: %d, Columns: %d" %
                                      (rows, columns))
            how_to_parse = __determine_how_to_group_observables__(
                df_columns_indexed=for_lines_indexed)
            self.__generate_pandas_series_from_df__(
                station_dataframe=station_respective_df,
                how_to_parse=how_to_parse,
                df_columns_indexed=for_lines_indexed,
                station_id=station_id)

    def __open_template__(self):
        """
        This function open the self.template file and stores header (self.template_header_indexed) as dictionary
        and for_lines_arguments (self.template_for_lines_indexed) as dictionary.
        :return:
        """
        self.template.seek(0)
        text = self.template.read()
        # parse header
        header = re.findall(var_parse_header, text)[0].strip('\r\n').split(',')
        # create a dictionary with indices
        header = dict(enumerate(header))
        # TODO: Please check if the following is needed and remove.
        # self.template_header_indexed = dict(enumerate(header))

        # TODO: Please check if the following is needed and remove.
        for_lines = re.findall(var_for_line,
                               text)[0].strip('\r\n').split(',{{')

        # for_lines = re.findall(var_for_line, text)[0].strip('\r\n')
        # for_lines = re.findall(var_name, for_lines)

        for_lines = list(
            map(lambda x: x if x.startswith("{{") else "{{" + x, for_lines))

        self.usecols = dict(enumerate(for_lines))
        self.usecols = list(
            filter(lambda key: re.search(var_name, self.usecols[key]),
                   self.usecols))

        # Parse only values that need to be parsed (i.e. those inside placeholders{{}})
        for_lines = list(filter(lambda x: re.search(var_name, x), for_lines))

        self.template_for_lines_indexed = dict(enumerate(for_lines))
        self.for_lines = list(
            filter(lambda x: re.search(var_name, x), for_lines))
        self.for_lines = list(map(lambda x: x.strip("{}\r\n"), self.for_lines))
        self.template_header = dict()
        for index, label in self.template_for_lines_indexed.items():
            label_to_be_stored = label.strip("{}")
            if label_to_be_stored in self.template_header.keys():
                # TODO: This is hard-coded just to work for BoM; generalize it.
                label_to_be_stored = label_to_be_stored + ".1"
            try:
                self.template_header[label_to_be_stored] = header[index]
            except Exception as e:
                self.template_logger.error(
                    "Can't create self.template_header for %s Exception: %s %s"
                    % (label_to_be_stored, type(e).__name__, str(e)))
        self.template.seek(0)

    def __set_dataframe_index_col(self):
        """
        This function creates a list with all values that will be passed over (self.no_parse_vars).
        It looks at all the for_line vars, finds the one that starts with "timestamp" and sets that column
        as the index of the dataframe.
        TODO: Make this more generic
        :return:
        """
        # In some cases the timestamp could extend over more than one column,
        # e.g. 09.05.2014,14:23:34,0.004
        # Thus the index should be a dictionary: {date: index1, month: index2, time: index3}
        # Columns representing the above indices should be merged when the dataframe is constructed.
        self.parse_dates = dict()
        self.parse_dates['timestamp'] = dict()
        self.parse_dates['timestamp']['indices'] = list()
        self.parse_dates['timestamp']['format'] = list()
        self.index = {}
        for index, variable in self.template_for_lines_indexed.items():
            match = re.search(var_name, variable)
            match_same_timestamp = re.search(var_same_timestamp, variable)

            if match:
                if match_same_timestamp:
                    fn_dict = match_same_timestamp.groupdict()
                    arguments = [
                        arg.strip() for arg in fn_dict['args'].split(',')
                    ]
                    # We pick one of the two, since they are the same
                    # arguments = ['windm_spd.timestamp.time', 'windm_dir.timestamp.time']
                    self.same_timestamp_arguments = arguments
                    name_of_variable_without_brackets = arguments[0]
                else:
                    name_of_variable_without_brackets = re.findall(
                        var_name, variable)[0]
                    # name_of_variable_without_brackets: timestamp.date
                    # Thus we split by '.' and take the last item, i.e. date, time, etc.
                    dict_key = name_of_variable_without_brackets.split('.')[-1]
                if name_of_variable_without_brackets.startswith("timestamp"):
                    self.parse_dates['timestamp']['indices'].append(index)
                    if dict_key.lower() == "year":
                        self.parse_dates['timestamp']['format'].append('%Y')
                    elif dict_key.lower() == "month":
                        self.parse_dates['timestamp']['format'].append('%m')
                    elif dict_key.lower() == "day":
                        self.parse_dates['timestamp']['format'].append('%d')
                    elif dict_key.lower() == "dayofyear":
                        self.parse_dates['timestamp']['format'].append('%j')
                    elif dict_key.lower() == "hour":
                        self.parse_dates['timestamp']['format'].append('%H')
                    elif dict_key.lower() == "minutes":
                        self.parse_dates['timestamp']['format'].append('%M')
                    elif dict_key.lower() == "seconds":
                        self.parse_dates['timestamp']['format'].append('%S')
                    else:
                        self.template_logger.debug("%s timestamp type" %
                                                   dict_key)
                elif "timestamp" in name_of_variable_without_brackets:
                    # all cases where timestamp is not first (e.g. wind.timestamp..)
                    additional_timestamp = name_of_variable_without_brackets.split(
                        '.')[0] + '.timestamp'
                    if additional_timestamp not in self.parse_dates:
                        self.parse_dates[additional_timestamp] = dict()
                        # All sub-timestamps depend on the main timestamp
                        self.parse_dates[additional_timestamp][
                            'indices'] = copy.deepcopy(
                                self.parse_dates['timestamp']['indices'])

                    self.parse_dates[additional_timestamp]['indices'].append(
                        index)
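
        # Illustration (hypothetical template): for the for-line variables
        # {{timestamp.day}},{{timestamp.month}},{{timestamp.year}},{{temp.value}}
        # this method ends up with
        #   self.parse_dates['timestamp'] == {'indices': [0, 1, 2],
        #                                     'format': ['%d', '%m', '%Y']}
        # so that columns 0-2 are later merged and parsed with the format "%d %m %Y".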

    def __create_dataframe_from_csv__(self):
        """
        At the end of this function, df has the correct timestamp as its index and all non-relevant columns dropped.
        We still need to parse static information (such as timestamps) from the header, i.e. the column names.
        :return:
        """
        # I don't parse the header of the input file. Instead I use the template header
        # for the names of the df columns.
        if self.df is None:
            parse_dates = dict()
            for key, value in self.parse_dates.items():
                parse_dates[key] = value['indices']
            if self.parse_dates['timestamp']['format']:

                def date_parser(x):
                    try:
                        return pd.datetime.strptime(
                            x,
                            ' '.join(self.parse_dates['timestamp']['format']))
                    except (ValueError, TypeError):
                        # This catches rows where the datetime column contains garbage (e.g. "Site closed")
                        return x
            else:
                date_parser = None
            self.df = pd.read_csv(
                self.input_file,
                na_values='---',
                header=0,
                usecols=self.usecols,
                names=self.for_lines,
                warn_bad_lines=True,
                parse_dates=parse_dates,
                date_parser=date_parser,
                infer_datetime_format=True,
                keep_date_col=False,
                error_bad_lines=False,
            )
            self.df.set_index(keys=["timestamp"], drop=True, inplace=True)
            # Drop nan rows and columns
            self.df.dropna(axis=0, how='all', inplace=True)
            self.df.dropna(axis=1, how='all', inplace=True)
            # Let's create duplicate lines of same_timestamp_arguments (if any)
            if self.same_timestamp_arguments:
                arguments = copy.deepcopy(self.same_timestamp_arguments)
                arguments = list(
                    map(lambda x: x.split('.')[0] + '.timestamp', arguments))
                key_argument = copy.deepcopy(arguments[0])
                del arguments[0]
                # The following if clause is for BoM data, when no windmax values are given...
                try:
                    check_list = list(
                        filter(lambda x: 'nan' not in x,
                               self.df[key_argument].values.tolist()))
                except Exception:
                    check_list = True

                if check_list:
                    for arg in arguments:
                        self.df[arg] = self.df[key_argument]
                else:
                    self.df.drop(key_argument, axis=1, inplace=True)


    def __handle_station_column__(self):
        temp_parsing_tuples = list()
        for key in list(self.template_for_lines_indexed):
            column_name = self.template_for_lines_indexed[key].strip('{{}}')

            # I assume that if station is mentioned in the observations iteration, it will be the FK to the station
            # to which the data points refer.
            # TODO: However, this should be more generic and predict any other unforeseen cases.
            if 'station' in column_name:
                distinct_stations = self.df[column_name].unique()

                for tags_station_id in distinct_stations:
                    # If the station is one of those the user defined in the config file...
                    database_station_id = self.__return_dbstation_id_by_tags_value__(
                        tags_station_id)
                    if database_station_id in self.config.station_id:
                        respective_station_df = self.df.loc[
                            self.df[column_name] == tags_station_id].copy()
                        respective_station_df.drop(column_name,
                                                   axis=1,
                                                   inplace=True)
                        temp_parsing_tuples.append(
                            (database_station_id, respective_station_df))
                    else:
                        self.template_logger.warning(
                            "Your input file contains a station with id '%s', \n"
                            "but your config does not define "
                            "a station with this id. \n The program will exit shortly... \n"
                            % tags_station_id)

                self.template_for_lines_indexed.pop(key)
                break

        # Update the template_for_lines_indexed dictionary
        # This is an essential step since data parsing is based on this dictionary
        # Now let's check if we have the case of the Australian data,
        # that is, if we have row-based data

        # if 'observable.observable_id' == column_name:
        #     unique_observables = self.df[column_name].unique()

        # I will now create a new df column for each of those unique observables.
        # The values of those columns are taken from 'observable.observable_id.value'.

        temp_temp_parsing_tuples = list()

        if '{{observable.observable_id}}' in self.template_for_lines_indexed.values(
        ):
            for station, resp_df in temp_parsing_tuples:
                resp_df = resp_df.pivot_table(
                    index=resp_df.index.name,
                    columns='observable.observable_id',
                    values='observable.observable_id.value')
                # resp_df = resp_df.pivot(columns='observable.observable_id', values='observable.observable_id.value')
                # try:
                #     resp_df = resp_df.pivot(columns='observable.observable_id', values='observable.observable_id.value')
                # except:
                #     print(resp_df)
                temp_temp_parsing_tuples.append((station, resp_df))
            del temp_parsing_tuples
            temp_parsing_tuples = temp_temp_parsing_tuples
        del temp_temp_parsing_tuples
        # If list is empty!
        if not temp_parsing_tuples:
            # I assume that in config one and only one station was defined..
            one_tuple = (self.config.station_id[0], self.df)
            temp_parsing_tuples.append(one_tuple)
            # self.parsing_tuples.append(one_tuple)

        # At this point we have dataframes which have timestamp as index, and other columns are for the observables
        for station_id, station_df in temp_parsing_tuples:
            temp_for_lines_indexed = dict()
            for column_name in list(station_df):
                # TODO: curly brackets are kept for consistency with the template placeholder syntax
                temp_for_lines_indexed[station_df.columns.get_loc(
                    column_name)] = "{{" + column_name + "}}"
            tuple_to_be_added = station_id, station_df, temp_for_lines_indexed
            self.parsing_tuples.append(tuple_to_be_added)

    def __update_index_timestamp_from_column__(self,
                                               station_df,
                                               var,
                                               grouped_col=False):
        # Act only on columns with additional timestamps;
        # check if we need to parse data from the header
        regex = re.compile(r'{%.?set (.*?).?%}')
        try:
            local_var = self.template_header[var]
        except Exception as e:
            self.template_logger.warning("%s %s" % (type(e).__name__, str(e)))
            self.template_logger.warning("I am returning df without updates")
            new_df = pd.DataFrame()
            new_df["value"] = pd.Series(station_df[var],
                                        index=station_df.index)
            return new_df
        match = re.search(regex, local_var)
        if grouped_col:
            new_df = station_df

        else:
            new_df = pd.DataFrame()
            new_df["value"] = pd.Series(station_df[var],
                                        index=station_df.index)
        if match:
            # example: {{wind.timestamp.hour=9}}
            fullname_in_list = re.findall(regex, local_var)[0]

            # example: ['wind.timestamp.hour', '9']
            splitted_by_equal_sign = __get_statements_from_placeholders__(
                fullname_in_list)

            # example: hour
            # but we need only the first letter
            # thus [0]
            unit = splitted_by_equal_sign[0].split('.')[-1][0]

            new_df.index += pd.TimedeltaIndex(pd.Series(
                np.full(new_df.shape[0], int(splitted_by_equal_sign[1]))),
                                              unit=unit)

        return new_df

    def __determine_observable_id_from_db__(self, var,
                                            station_id) -> HelperTemplateIDs:
        """
        :param var: {{temp.value}}
        :return:
        """
        var = re.findall(var_name, var)[0].split('.')[0]
        helper_template_row = self.Data.__get_helper_table_row_input_file_observable_id__(
            var, station_id)

        return helper_template_row

    def __generate_pandas_series_from_df__(self, station_dataframe,
                                           how_to_parse, df_columns_indexed,
                                           station_id):
        for col_index in how_to_parse:
            # If col_index is an int, the column should be parsed independently.
            # If it is a list of grouped columns, another for loop is needed.
            # dataframe_to_store = pd.DataFrame()
            if type(col_index) is int:
                var = station_dataframe[
                    station_dataframe.columns[col_index]].name
                dataframe_to_store = self.__update_index_timestamp_from_column__(
                    station_df=station_dataframe,
                    var=var)  # type: pd.DataFrame
                helper_id_row = self.__determine_observable_id_from_db__(
                    df_columns_indexed[col_index], station_id)

                dataframe_to_store["helper_observable_id"] = pd.Series(
                    helper_id_row.id, index=dataframe_to_store.index)

                update_helper_with_meta = dict()
                try:
                    update_helper_with_meta['frequency'] = pd.infer_freq(
                        dataframe_to_store.index)
                    update_helper_with_meta[
                        'start_date'] = dataframe_to_store.index[0]
                    update_helper_with_meta[
                        'end_date'] = dataframe_to_store.index[-1]
                    update_helper_with_meta[
                        'number_of_observations'] = len(dataframe_to_store)
                    self.__update_helper_observable_id(
                        helper_id_row, update_helper_with_meta)
                except Exception:
                    pass
            else:
                dataframe_to_store = pd.DataFrame(
                    index=station_dataframe.index)
                found_index_column = False
                for grouped_column in col_index[:]:
                    if type(grouped_column) is str:
                        grouped_column = int(grouped_column)

                        col_index.remove(str(grouped_column))
                        found_index_column = True
                        break
                dataframe_to_store[
                    'value'] = station_dataframe.iloc[:, col_index].astype(
                        str).apply(lambda x: ' '.join(x), axis=1)
                if found_index_column and type(grouped_column) is int:
                    dataframe_to_store.index = station_dataframe[
                        station_dataframe.columns[grouped_column]]

                # We don't care which of the grouped columns we select; they all refer to the same entity.
                # They are grouped after all
                var = station_dataframe[station_dataframe.columns[
                    col_index[0]]].name
                # Update index from column

                dataframe_to_store = self.__update_index_timestamp_from_column__(
                    station_df=dataframe_to_store, var=var,
                    grouped_col=True)  # type: pd.DataFrame

                helper_id_row = self.__determine_observable_id_from_db__(
                    df_columns_indexed[int(col_index[0])], station_id)
                dataframe_to_store["helper_observable_id"] = pd.Series(
                    helper_id_row.id, index=dataframe_to_store.index)
                update_helper_with_meta = dict()
                try:
                    update_helper_with_meta['frequency'] = pd.infer_freq(
                        dataframe_to_store.index)
                    update_helper_with_meta[
                        'start_date'] = dataframe_to_store.index[0]
                    update_helper_with_meta[
                        'end_date'] = dataframe_to_store.index[-1]
                    update_helper_with_meta[
                        'number_of_observations'] = len(dataframe_to_store)
                    self.__update_helper_observable_id(
                        helper_id_row, update_helper_with_meta)
                except Exception:
                    pass

            # Clean from duplicate records
            # dataframe_to_store = self.Data.clean_df_db_dups(df=dataframe_to_store, tablename="Observations",
            #                                                 dup_cols=list(dataframe_to_store))

            self.Data.__add_dataframe__(dataframe_to_store)

    def __check_if_observable_is_stored__(self, observable):
        return self.Data.__check_observable_is_in_db__(observable)

    def __return_dbstation_id_by_tags_value__(self, station_id):
        """
        #TODO: This function should be more generic. E.g. By placeholder value
        In the template we have something like: {{station.tags.station_id}}
        Logic of this program is "smart" enough to identify that station_id is a key of the JSON type "tag".
        Thus, this can be handled automatically in the future..
        :param station_id: tags station_id, e.g. 210
        :return: a dictionary. old_value:new_value
        """

        database_station_id = self.Data.__get_station_id_by_tags_station_id__(
            station_id)
        # for old_value, new_value in temp_dict.items():
        #     self.df[column_name] = self.df[column_name].replace(to_replace=old_value, value=new_value)
        return database_station_id

    def __check_if_sensor_is_stored__(self, sensor):
        """
        
        :param sensor:
        :return: True if it exists, False if it does not
        """
        return self.Data.__check_sensor_is_in_db__(sensor)

    def __store_item_in_db(self, item):
        self.Data.__add_item__(item)

    def __update_helper_observable_id(self,
                                      helper_observable_id: HelperTemplateIDs,
                                      meta_dictionary):
        # helper_observable_id.update_meta(metadata_in_dict=meta_dictionary)
        self.Data.__update_item__(helper_observable_id,
                                  metadata_dict=meta_dictionary)
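
As a closing illustration, a standalone sketch (with made-up data) of the row-based to column-based pivot that __handle_station_column__ applies to Australian-style inputs:

import io
import pandas as pd

csv = io.StringIO("timestamp,observable.observable_id,observable.observable_id.value\n"
                  "2014-05-09,temp,21.5\n"
                  "2014-05-09,hum,40.0\n"
                  "2014-05-10,temp,22.1\n"
                  "2014-05-10,hum,38.5\n")
df = pd.read_csv(csv, parse_dates=['timestamp'], index_col='timestamp')

# One column per unique observable, indexed by timestamp
pivoted = df.pivot_table(index=df.index.name,
                         columns='observable.observable_id',
                         values='observable.observable_id.value')
print(pivoted)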