Example No. 1
    def module_task(self, params):

        self.logger.info(
            'Starting Hbase-HBase ETL using Hadoop to clean billing data...')
        """CHECK INCONSISTENCIES IN params"""
        try:
            result_companyId = params['result_companyId']
            data_companyId = params[
                'data_companyId'] if 'data_companyId' in params else []
            ts_to = params['ts_to']
            ts_from = params[
                'ts_from'] if 'ts_from' in params else date_n_month(
                    ts_to, -96)
            energyTypeList = params['type'] if 'type' in params else []
        except KeyError as e:
            raise Exception('Mandatory Parameter not provided: {}'.format(e))

        ######################################################################################################################################################################################
        """ GET DATA FROM MONGO TO MAKE QUERYS """
        ######################################################################################################################################################################################
        if not energyTypeList:
            energyTypeList = list(
                set([
                    x['type']
                    for x in self.mongo['readings'].find({}, {'type': 1})
                ]))

        if not data_companyId:
            data_companyId = list(
                set([
                    x['companyId']
                    for x in self.mongo['companies'].find({}, {'companyId': 1})
                ]))

        ######################################################################################################################################################################################
        """ HIVE QUERY TO GET HBASE DATA """
        ######################################################################################################################################################################################
        for measure_config in self.config['measures']:
            # Create temp tables with hbase data, add them to context_clean to be deleted after execution
            tables = []
            type_table_name = measure_config["hbase_table"]
            tables_source = []
            tables_energyType = []
            self.logger.info('creating {} tables for {} and {}'.format(
                type_table_name, energyTypeList, data_companyId))

            tables_list = self.hbase.tables()
            for energyType in energyTypeList:
                for companyId in data_companyId:
                    try:
                        table_name = "{}_{}_{}".format(type_table_name,
                                                       energyType, companyId)
                        if table_name not in tables_list:
                            continue
                        hbase_table_name = "{}{}{}".format(
                            self.config['hbase']['db'],
                            self.config['hbase']['db_separator'], table_name)
                        keys = measure_config['hbase_keys']
                        columns = measure_config['hbase_columns']
                        temp_table = create_hive_table_from_hbase_table(
                            self.hive, table_name, hbase_table_name, keys,
                            columns, self.task_UUID)
                        tables.append(temp_table)
                        self.context.add_clean_hive_tables(temp_table)
                        tables_energyType.append(energyType)
                        tables_source.append(companyId)
                        self.logger.debug(
                            "Created table: {}".format(temp_table))
                    except Exception as e:
                        self.logger.debug("Error creating table: {}".format(e))
            self.logger.debug("Created {} temp tables".format(len(tables)))

            fields = measure_config["hive_fields"]

            location = measure_config['measures'].format(UUID=self.task_UUID)
            self.context.add_clean_hdfs_file(location)
            input_table = create_hive_module_input_table(
                self.hive, measure_config['temp_input_table'], location,
                fields, self.task_UUID)

            #add input table to be deleted after execution
            self.context.add_clean_hive_tables(input_table)
            qbr = RawQueryBuilder(self.hive)
            select = ", ".join(
                [f[0] for f in measure_config["sql_sentence_select"]])
            sentence = """
                INSERT OVERWRITE TABLE {input_table}
                SELECT {select} FROM
                ( """.format(select=select, input_table=input_table)
            letter = ["a{}".format(i) for i in range(len(tables) + 1)]
            text = []
            for index, tab in enumerate(tables):
                var = letter[index]
                energy_type = tables_energyType[index]
                source = tables_source[index]
                select = ", ".join([
                    f[1] for f in measure_config["sql_sentence_select"]
                ]).format(var=var, energy_type=energy_type, source=source)
                where = measure_config["sql_where_select"].format(
                    var=var, ts_from=ts_from, ts_to=ts_to)
                text.append(""" SELECT {select} FROM {tab} {var}
                                  WHERE
                                      {where}
                                  """.format(var=var,
                                             select=select,
                                             tab=tab,
                                             where=where))
            sentence += """UNION
                        """.join(text)
            sentence += """) unionResult """

            self.logger.debug(sentence)
            try:
                qbr.execute_query(sentence)
            except Exception as e:
                self.logger.debug("Hive query failed, skipping measure: {}".format(e))
                continue

        #####################################################################################################################################################################################
        """ SETUP MAP REDUCE JOB """
        ######################################################################################################################################################################################
        # remove previous raw_data results
        output_fields = self.config['output']['fields']
        clean_tables = []
        for measure_config in self.config['measures']:
            clean_file_name = measure_config['clean_output_file'].format(
                UUID=self.task_UUID)
            self.context.add_clean_hdfs_file(clean_file_name)
            clean_table_name = measure_config['clean_output_table']
            self.logger.debug('Launching MR job to clean the daily data')
            try:
                # Launch MapReduce job
                self.launcher_hadoop_job(
                    measure_config['type'],
                    measure_config['measures'].format(UUID=self.task_UUID),
                    clean_file_name, result_companyId)
            except Exception as e:
                raise Exception('MRJob process has failed: {}'.format(e))

            clean_table = create_hive_module_input_table(
                self.hive, clean_table_name, clean_file_name, output_fields,
                self.task_UUID)
            self.context.add_clean_hive_tables(clean_table)
            clean_tables.append([clean_table, measure_config['type']])
            self.logger.debug("MRJob finished for {}".format(
                measure_config['type']))

        ######################################################################################################################################################################################
        """ Join the output in a hive table """
        ######################################################################################################################################################################################

        output_file_name = self.config['output']['output_file_name']
        output_hive_name = self.config['output']['output_hive_table']
        output_hive_table = create_hive_module_input_table(
            self.hive, output_hive_name, output_file_name, output_fields)
        try:
            # consume the delete result (it may be a lazy generator) and ignore
            # failures such as the path not existing yet
            for _ in self.hdfs.delete([output_file_name], recurse=True):
                pass
        except Exception:
            pass
        select = ", ".join(
            [f[0] for f in self.config['output']["sql_sentence_select"]])
        sentence = """
                        INSERT OVERWRITE TABLE {output_table}
                        SELECT {select} FROM
                        ( """.format(select=select,
                                     output_table=output_hive_table)
        letter = ["a{}".format(i) for i in range(len(clean_tables) + 1)]
        text = []
        for index, tab in enumerate(clean_tables):
            var = letter[index]
            select = ", ".join([
                f[1] for f in self.config['output']["sql_sentence_select"]
            ]).format(var=var, data_type=tab[1])
            text.append(""" SELECT {select} FROM {tab} {var}
                              """.format(var=var, select=select, tab=tab[0]))
        sentence += """UNION
                    """.join(text)
        sentence += """) unionResult """

        self.logger.debug(sentence)
        qbr = RawQueryBuilder(self.hive)
        qbr.execute_query(sentence)

        self.logger.info(
            'Hbase-HBase ETL clean billing data execution finished...')
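This and the later examples default ts_from by shifting ts_to back a number of months with a date_n_month helper that is not part of the listing. A minimal sketch of what such a helper could look like, assuming datetime inputs and that a negative offset moves backwards in time (the project's real implementation may differ):

from datetime import datetime
from dateutil.relativedelta import relativedelta


def date_n_month(ts, n_months):
    # Hypothetical sketch: shift a datetime by n_months; negative values go back in time.
    return ts + relativedelta(months=n_months)


print(date_n_month(datetime(2017, 1, 1), -96))  # 2009-01-01 00:00:00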
Example No. 2
    def module_task(self, params):
        self.logger.info('Starting Module for edinet baseline...')
        """CHECK INCONSISTENCIES IN params"""
        try:
            result_companyId = params['result_companyId']
            ts_to = params['ts_to']
            ts_from = params[
                'ts_from'] if 'ts_from' in params else date_n_month(
                    ts_to, -24)
            energyTypeList = params['type'] if 'type' in params else []
            save_data_debug = True if 'debug' in params and params[
                'debug'] else False
        except KeyError as e:
            raise Exception(
                'Not enough parameters provided to module: {}'.format(e))

        #####################################################################################################################################################################################
        """  LOAD DATA FROM MONGO MODELLING UNITS  """
        ######################################################################################################################################################################################

        self.logger.info('Extracting modelling_units from mongo')

        # setting variables for readability
        modelling_units_collection = self.config['mongodb'][
            'modelling_units_collection']
        weather_stations_collection = self.config['mongodb'][
            'weather_stations_collection']
        lon_lat_tz_dict = {}
        tf = TimezoneFinder(in_memory=True)
        self.logger.debug('Querying for weather station info in MongoDB')
        cursor = self.mongo[weather_stations_collection].find({})
        for station in cursor:
            lon_lat_tz_dict[station['stationId']] = {
                "lat":
                station['latitude'],
                "lon":
                station['longitude'],
                "tz":
                tf.timezone_at(lat=station['latitude'],
                               lng=station['longitude'])
            }
        cursor.close()
        tf = None
        device_key = {}
        stations = {}
        solar_station = {}
        self.logger.debug('Querying for modelling unit info in MongoDB')
        cursor = self.mongo[modelling_units_collection].find({})
        for item in cursor:

            if len(item['devices']) > 0 and item[
                    'stationId'] != "Unknown":  # to avoid empty list of devices
                for dev in item['devices']:

                    stations[dev['deviceId']] = item[
                        'stationId'] if 'stationId' in item else None
                    solar_station[dev['deviceId']] = item[
                        'solar_station'] if 'solar_station' in item else None
                    key_str = "{modelling}~{devices}~{lat}~{lon}~{tz}".format(
                        modelling=item['modellingUnitId'],
                        devices=item['devices'],
                        lat=lon_lat_tz_dict[item['stationId']]['lat'],
                        lon=lon_lat_tz_dict[item['stationId']]['lon'],
                        tz=lon_lat_tz_dict[item['stationId']]['tz'])
                    if dev['deviceId'] in device_key.keys():
                        device_key[dev['deviceId']].append(key_str)
                    else:
                        device_key[dev['deviceId']] = [key_str]
        cursor.close()
        self.logger.info('A mongo query process has loaded {} devices'.format(
            len(device_key.keys())))

        ######################################################################################################################################################################################
        """ CREATE INPUT DATA FROM HIVE TABLES """
        ######################################################################################################################################################################################
        # create a table to link devices with stations
        self.logger.debug('creating weather hive table')

        weather_stations_df = pd.DataFrame(data={
            "deviceId": list(stations.keys()),
            "stationId": list(stations.values())
        },
                                           columns=["deviceId", "stationId"])

        f_station = NamedTemporaryFile(delete=False, suffix='.csv')
        weather_stations_df.to_csv(f_station.name, header=None, index=None)

        # create the target HDFS directory, then upload the local CSV into it
        call([
            "hadoop", "fs", "-mkdir", "-p",
            self.config['paths']['stations']
        ])
        call([
            "hadoop", "fs", "-copyFromLocal", f_station.name,
            self.config['paths']['stations']
        ])
        weather_stations = create_hive_module_input_table(
            self.hive,
            'edinet_weather_stations_table',
            self.config['paths']['stations'], [('deviceId', 'string'),
                                               ('stationId', 'string')],
            self.task_UUID,
            sep=",")
        self.context.add_clean_hive_tables(weather_stations)

        # create a table to link devices with solar_stations
        self.logger.debug('creating solar hive table')

        solar_stations_df = pd.DataFrame(data={
            "deviceId":
            list(solar_station.keys()),
            "stationId":
            list(solar_station.values())
        },
                                         columns=["deviceId", "stationId"])
        f_solar_station = NamedTemporaryFile(delete=False, suffix='.csv')
        solar_stations_df.to_csv(f_solar_station.name, header=None, index=None)

        # create the target HDFS directory, then upload the local CSV into it
        call([
            "hadoop", "fs", "-mkdir", "-p",
            self.config['paths']['solar_stations']
        ])
        call([
            "hadoop", "fs", "-copyFromLocal", f_solar_station.name,
            self.config['paths']['solar_stations']
        ])
        solar_stations = create_hive_module_input_table(
            self.hive,
            'edinet_solar_stations_table',
            self.config['paths']['solar_stations'], [('deviceId', 'string'),
                                                     ('stationId', 'string')],
            self.task_UUID,
            sep=",")
        self.context.add_clean_hive_tables(solar_stations)

        # create a table with the devices values
        self.logger.debug('creating input table')

        final_table_fields = [
            [x[0], x[1]] for x in self.config['hive']['final_table_fields']
        ]

        location = self.config['paths']['measures']

        input_table = create_hive_module_input_table(
            self.hive, self.config['hive']['job_table_name'], location,
            final_table_fields, self.task_UUID)

        #add input table to be deleted after execution
        self.context.add_clean_hive_tables(input_table)
        self.logger.debug('creating hive query')

        qbr = RawQueryBuilder(self.hive)
        total_select_joint = ", ".join([
            "{}.{}".format(x[2], x[0])
            for x in self.config['hive']['final_table_fields']
        ])
        sentence = """
            INSERT OVERWRITE TABLE {input_table}
            SELECT {total_select_joint} FROM
                (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value, ai.energyType as energyType FROM edinet_hourly_consumption ai
                    WHERE
                        ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                        ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss") AND
                        ai.deviceid IN ({devices})) a
                JOIN {weather_stations} b on a.deviceId==b.deviceId
                JOIN {solar_stations} b1 on a.deviceId==b1.deviceId
                JOIN  edinet_meteo c on b.stationId==c.stationId and SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(c.ts), 1, 13)
                JOIN  edinet_meteo d on b1.stationId==d.stationId and SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(d.ts), 1, 13)

                """.format(input_table=input_table,
                           total_select_joint=total_select_joint,
                           ts_from=ts_from,
                           ts_to=ts_to,
                           weather_stations=weather_stations,
                           solar_stations=solar_stations,
                           devices=", ".join("\"{}\"".format(x)
                                             for x in list(device_key.keys())))

        self.logger.debug(sentence)
        qbr.execute_query(sentence)

        #####################################################################################################################################################################################
        """  LOAD from MONGO to HBASE  """
        ######################################################################################################################################################################################
        self.logger.info('Launching MapReduce job')
        try:
            # Launch MapReduce job
            ## Buffered measures to HBase
            self.logger.debug('Baseline Calculation')
            self.launcher_hadoop_job(location, device_key, result_companyId,
                                     save_data_debug)
        except Exception as e:
            raise Exception('MRJob ALIGN process job has failed: {}'.format(e))
        self.logger.info('Module EDINET_baseline execution finished...')
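For reference, the device_key mapping built above associates every device with one or more keys of the form "{modelling}~{devices}~{lat}~{lon}~{tz}", which the MapReduce launcher later receives. A small illustration with made-up station and modelling-unit documents:

# Illustration only: the documents below are invented to show the key format.
station = {"stationId": "st01", "latitude": 41.39, "longitude": 2.17, "tz": "Europe/Madrid"}
item = {
    "modellingUnitId": "building-42",
    "stationId": "st01",
    "devices": [{"deviceId": "dev-a"}, {"deviceId": "dev-b"}],
}

device_key = {}
for dev in item["devices"]:
    key_str = "{modelling}~{devices}~{lat}~{lon}~{tz}".format(
        modelling=item["modellingUnitId"], devices=item["devices"],
        lat=station["latitude"], lon=station["longitude"], tz=station["tz"])
    device_key.setdefault(dev["deviceId"], []).append(key_str)

print(device_key["dev-a"][0])
# building-42~[{'deviceId': 'dev-a'}, {'deviceId': 'dev-b'}]~41.39~2.17~Europe/Madrid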
Example No. 3
    def module_task(self, params):
        self.logger.info('Starting Module for edinet comparisons...')
        """CHECK INCONSISTENCIES IN params"""
        try:
            result_companyId = params['result_companyId']
            ts_to = params['ts_to']
            ts_from = params[
                'ts_from'] if 'ts_from' in params else date_n_month(
                    ts_to, -48)
            energyTypeDict = params['type'] if 'type' in params else {
                'heatConsumption': 'gasConsumption',
                'gasConsumption': 'gasConsumption',
                'monthlyElectricityConsumption': 'electricityConsumption',
                'electricityConsumption': 'electricityConsumption'
            }
        except KeyError as e:
            raise Exception(
                'Not enough parameters provided to module: {}'.format(e))

        #####################################################################################################################################################################################
        """  LOAD from MONGO  """
        ######################################################################################################################################################################################
        # Get the link between devices and modelling units, as a dict of the form
        # {"deviceId": ["modellingUnitId~devices~area", ...]}, where devices is the
        # modelling unit's device list ({device: multiplier} entries)
        self.logger.info('Extracting data from mongodb')
        modelling_units_collection = self.config['mongodb'][
            'modelling_units_collection']
        cursor = self.mongo[modelling_units_collection].find({})
        device_key = {}

        def get_building(modelling_unit, mongo, building_collection,
                         reporting_collection):
            building = mongo[building_collection].find_one(
                {"modellingUnits": modelling_unit})
            if not building:
                reporting = mongo[reporting_collection].find_one(
                    {"modelling_Units": modelling_unit})
                if reporting and "reportingUnitId" in reporting:
                    building = mongo[building_collection].find_one(
                        {"buildingId": reporting['reportingUnitId']})
            if not building:
                return None
            return building

        building_collection = self.config['mongodb']['buildings_collection']
        reporting_collection = self.config['mongodb']['reporting_collection']
        self.logger.debug("generating the device_key dict")
        for item in cursor:
            #self.logger.debug(item)
            #self.logger.debug("gettinng item building {}".format(item['modellingUnitId']))
            building = get_building(item['modellingUnitId'], self.mongo,
                                    building_collection, reporting_collection)
            #self.logger.debug("obtained building {}".format(building))

            if building and 'data' in building and 'areaBuild' in building[
                    'data']:
                surface = building["data"]["areaBuild"]
            else:
                surface = None
            #self.logger.debug("area of building: {}".format(surface))
            if len(item['devices']
                   ) > 0 and surface:  # to avoid empty list of devices
                #self.logger.debug("list of devices {}".format(item['devices']))
                for dev in item['devices']:
                    key_str = "{modelling}~{devices}~{area}".format(
                        modelling=item['modellingUnitId'],
                        devices=item['devices'],
                        area=surface)
                    if dev['deviceId'] in device_key.keys():
                        device_key[dev['deviceId']].append(key_str)
                    else:
                        device_key[dev['deviceId']] = [key_str]
            #self.logger.debug("finished for {}".format(item['modellingUnitId']))
        cursor.close()
        self.logger.info('A mongo query process has loaded {} devices'.format(
            len(device_key.keys())))

        ######################################################################################################################################################################################
        """ HIVE QUERY TO PREPARE DATA THAT HAS TO BE LOADED INTO MONGO """
        ######################################################################################################################################################################################

        # create a table with the devices values that will be the input of the MRJob that creates the monthly datatable.
        self.logger.debug('creating input table to aggregate monthly')
        final_table_fields = [
            [x[0], x[1]] for x in self.config['hive']['final_table_fields']
        ]

        location = self.config['paths']['monthly_aggregation']

        input_table = create_hive_module_input_table(
            self.hive, self.config['hive']['job_table_name'], location,
            final_table_fields, self.task_UUID)

        #add input table to be deleted after execution
        self.context.add_clean_hive_tables(input_table)
        self.logger.debug('creating hive query')
        qbr = RawQueryBuilder(self.hive)

        total_select_joint = ", ".join([
            "{}.{}".format(x[2], x[0])
            for x in self.config['hive']['final_table_fields']
        ])
        sentence = """
            INSERT OVERWRITE TABLE {input_table}
            SELECT {total_select_joint} FROM
                (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value, ai.energyType as energyType, ai.source as source FROM edinet_daily_consumption ai
                    WHERE
                        ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                        ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss") AND
                        ai.deviceid IN ({devices})) a
                """.format(input_table=input_table,
                           total_select_joint=total_select_joint,
                           ts_from=ts_from,
                           ts_to=ts_to,
                           devices=", ".join("\"{}\"".format(x)
                                             for x in list(device_key.keys())))
        self.logger.debug(sentence)
        qbr.execute_query(sentence)
        self.hive.close()
        gc.collect()
        ######################################################################################################################################################################################
        """ MAPREDUCE TO AGGREGATE MONTHLY DATA """
        ######################################################################################################################################################################################
        self.logger.info('Running Mapreduce for Monthly Aggregation')
        output_location = self.config['paths']['output_monthly_aggregation']
        try:
            # Launch MapReduce job
            ## Buffered measures to HBase
            self.logger.debug('Monthly Aggregation')
            self.aggregate_hadoop_job(location, output_location, device_key,
                                      result_companyId)
        except Exception as e:
            raise Exception('MRJob ALIGN process job has failed: {}'.format(e))

        output_fields = [["modellingUnit", "string"], ["ts", "bigint"],
                         ["value", "float"], ["energyType", "string"]]
        aggregated_table_name = self.config['hive'][
            'output_monthly_aggregation']
        aggregated_table = create_hive_module_input_table(
            self.hive, aggregated_table_name, output_location, output_fields,
            self.task_UUID)
        self.context.add_clean_hive_tables(aggregated_table)
        self.logger.debug("MRJob for monthly aggregation finished")
        ######################################################################################################################################################################################
        """ MAPREDUCE TO CALCULATE BENCHMARKING """
        ######################################################################################################################################################################################
        self.logger.debug('creating benchmarking information table')
        building_collection = self.config['mongodb']['buildings_collection']
        cursor = self.mongo[building_collection].find({})
        buildings_list = []
        for item in cursor:
            if 'modellingUnits' not in item or 'data' not in item:
                continue
            if 'useType' not in item['data'] or 'organizationLevel1' not in item['data']:
                continue
            for modelling in item['modellingUnits']:
                b_dic = {
                    "modellingunit": modelling,
                    "type": item['data']['useType'],
                    "organization": item['data']['organizationLevel1']
                }
                buildings_list.append(b_dic)
        cursor.close()

        buildings_df = pd.DataFrame.from_records(
            buildings_list, columns=['modellingunit', 'type', 'organization'])
        f_station = NamedTemporaryFile(delete=False, suffix='.csv')
        buildings_df.to_csv(f_station.name, header=None, index=None)
        # create the target HDFS directory, then upload the local CSV into it
        call([
            "hadoop", "fs", "-mkdir", "-p",
            self.config['paths']['building_info']
        ])
        call([
            "hadoop", "fs", "-copyFromLocal", f_station.name,
            self.config['paths']['building_info']
        ])
        building_table = create_hive_module_input_table(
            self.hive,
            self.config['hive']['building_info_table'],
            self.config['paths']['building_info'],
            [('modellingunit', 'string'), ('type', 'string'),
             ('organization', 'string')],
            self.task_UUID,
            sep=",")
        self.context.add_clean_hive_tables(building_table)

        self.logger.debug('creating hive query to join data with information')
        qbr = RawQueryBuilder(self.hive)
        location = self.config['paths']['benchmarking_data']
        benchmarking_field = self.config['hive']['benchmarking_table_fields']
        benchmarking_table = create_hive_module_input_table(
            self.hive, self.config['hive']['benchmarking_table'], location,
            benchmarking_field, self.task_UUID)

        total_select_joint = ", ".join(
            ["{}.{}".format(x[2], x[0]) for x in benchmarking_field])
        sentence = """
           INSERT OVERWRITE TABLE {input_table}
           SELECT {total_select_joint} FROM
               (SELECT * FROM {aggregated_table}) a
               JOIN {building_table} b on a.modellingUnit==b.modellingUnit
               """.format(input_table=benchmarking_table,
                          total_select_joint=total_select_joint,
                          aggregated_table=aggregated_table,
                          building_table=building_table)
        self.logger.debug(sentence)
        qbr.execute_query(sentence)

        self.logger.info('Running Mapreduce for Benchmarking')
        try:
            # Launch MapReduce job
            ## Buffered measures to HBase
            self.logger.debug('Benchmarking_calculation')
            self.benchmarking_hadoop_job(location, energyTypeDict,
                                         result_companyId)
        except Exception as e:
            raise Exception('MRJob ALIGN process job has failed: {}'.format(e))

        self.logger.debug("MRJob for benchmarking finished")
Example No. 4
    def module_task(self, params):
        self.logger.info('Starting Module for edinet baseline...')
        """CHECK INCONSISTENCIES IN params"""
        try:
            result_companyId = params['result_companyId']
            ts_to = params['ts_to']
            ts_from = params[
                'ts_from'] if 'ts_from' in params else date_n_month(
                    ts_to, -24)
            energyTypeList = params['type'] if 'type' in params else []
        except KeyError as e:
            raise Exception(
                'Not enough parameters provided to module: {}'.format(e))

        ######################################################################################################################################################################################
        """ GET DATA FROM MONGO TO MAKE QUERYS """
        ######################################################################################################################################################################################
        if not energyTypeList:
            energyTypeList = list(
                set([
                    x['type']
                    for x in self.mongo['readings'].find({}, {'type': 1})
                ]))

        #####################################################################################################################################################################################
        """  LOAD DATA FROM HIVE  """
        ######################################################################################################################################################################################

        self.logger.info('Extracting data from mongodb')

        # setting variables for readability
        collection = self.config['mongodb']['modelling_units_collection']

        self.logger.debug('Querying for modelling units in MongoDB')
        cursor = self.mongo[collection].find({})

        device_key = {}
        stations = {}
        for item in cursor:
            if len(item['devices']) > 0:  # to avoid empty list of devices
                for dev in item['devices']:
                    stations[str(dev['deviceId'].encode('utf-8'))] = str(
                        item['stationId']) if 'stationId' in item else None
                    if str(dev['deviceId'].encode(
                            'utf-8')) in device_key.keys():
                        device_key[str(
                            dev['deviceId'].encode('utf-8'))].append(
                                str(item['modellingUnitId']) + '~' +
                                str(item['devices']))
                    else:
                        device_key[str(dev['deviceId'].encode('utf-8'))] = [
                            str(item['modellingUnitId']) + '~' +
                            str(item['devices'])
                        ]

        self.logger.info('A mongo query process has loaded {} devices'.format(
            len(device_key.keys())))

        ######################################################################################################################################################################################
        """ HIVE QUERY TO PREPARE DATA FOR MRJOB """
        ######################################################################################################################################################################################
        # create a table to link devices with stations
        device_stations_df = pd.DataFrame(data={
            "deviceId": stations.keys(),
            "stationId": stations.values()
        },
                                          columns=["deviceId", "stationId"])
        f = NamedTemporaryFile(delete=False, suffix='.csv')
        device_stations_df.to_csv(f.name, header=None, index=None)
        f.close()
        # create the target HDFS directory, then upload the local CSV into it
        call([
            "hadoop", "fs", "-mkdir", "-p",
            self.config['paths']['stations']
        ])
        call([
            "hadoop", "fs", "-copyFromLocal", f.name,
            self.config['paths']['stations']
        ])
        os.unlink(f.name)  # remove the local temp file (assumes `os` is imported at module level)
        device_stations = create_hive_module_input_table(
            self.hive,
            'edinet_device_stations_table',
            self.config['paths']['stations'], [('deviceId', 'string'),
                                               ('stationId', 'string')],
            self.task_UUID,
            sep=",")
        self.context.add_clean_hive_tables(device_stations)

        # create a table with the devices values

        fields = [('deviceId', 'string'), ('ts', 'int'), ('value', 'float'),
                  ('energyType', 'string'), ('source', 'string'),
                  ('temperature', 'string')]

        location = self.config['paths']['measures']

        input_table = create_hive_module_input_table(self.hive,
                                                     'edinet_baseline_input',
                                                     location, fields,
                                                     self.task_UUID)

        #add input table to be deleted after execution
        self.context.add_clean_hive_tables(input_table)
        qbr = RawQueryBuilder(self.hive)
        sentence = """
            INSERT OVERWRITE TABLE {input_table}
            SELECT a.deviceId, a.ts, a.value, a.energyType, a.source, c.temperature FROM
                (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value, ai.energyType as energyType, ai.source as source FROM edinet_hourly_consumption ai
                    WHERE
                        ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                        ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss")) a
                JOIN {device_stations} b on a.deviceId==b.deviceId
                JOIN  edinet_meteo c on b.stationId==c.stationId and SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(c.ts), 1, 13)
                """.format(input_table=input_table,
                           ts_from=ts_from,
                           ts_to=ts_to,
                           device_stations=device_stations)

        self.logger.debug(sentence)
        qbr.execute_query(sentence)

        ######################################################################################################################################################################################
        """ SETUP MAP REDUCE JOB """
        ######################################################################################################################################################################################

        self.logger.info('Launching MapReduce job')
        try:
            # Launch MapReduce job
            ## Buffered measures to HBase
            self.logger.debug('MRJob Align')
            self.launcher_hadoop_job('align', location, result_companyId,
                                     device_key, stations)
        except Exception as e:
            raise Exception('MRJob ALIGN process job has failed: {}'.format(e))
        self.logger.info('Module EDINET_baseline execution finished...')
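create_hive_module_input_table is not reproduced in these examples. Given how it is called (table name, HDFS location, (name, type) field pairs, an optional task UUID and separator), a rough sketch of the kind of DDL such a helper might emit for the device/station CSV is shown below; this is an assumption for illustration, not the project's actual helper:

def create_external_csv_table_ddl(table_name, location, fields, sep=","):
    # Sketch of a CREATE EXTERNAL TABLE statement over a delimited file in HDFS.
    columns = ", ".join("`{}` {}".format(name, hive_type) for name, hive_type in fields)
    return (
        "CREATE EXTERNAL TABLE IF NOT EXISTS {table} ({columns}) "
        "ROW FORMAT DELIMITED FIELDS TERMINATED BY '{sep}' "
        "STORED AS TEXTFILE LOCATION '{location}'"
    ).format(table=table_name, columns=columns, sep=sep, location=location)


print(create_external_csv_table_ddl(
    "edinet_device_stations_table", "/data/stations",
    [("deviceId", "string"), ("stationId", "string")]))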
Example No. 5
    def module_task(self, params):
        self.logger.info('Starting Module for edinet baseline...')
        """CHECK INCONSISTENCIES IN params"""
        try:
            companyId = params['companyId'] if 'companyId' in params else None
            companyId_toJoin = params['companyId_toJoin'] if 'companyId_toJoin' in params and params[
                'companyId_toJoin'] else []
            buffer_size = params['buffer_size'] if 'buffer_size' in params else 1000000
            timezone = params['timezone'] if 'timezone' in params else 'Europe/Madrid'
            ts_to = params['ts_to']
            energyType = params['type']
            ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -24)
            modellingUnits = params['modellingUnits'] if 'modellingUnits' in params else []
            debug = params['debug'] if 'debug' in params else None
            remove_tables = params['remove_tables'] if 'remove_tables' in params else True
        except KeyError as e:
            raise Exception('Not enough parameters provided to module: {}'.format(e))

        #####################################################################################################################################################################################
        """  LOAD from MONGO to HBASE  """
        ######################################################################################################################################################################################

        self.logger.info('Extracting data from mongodb')

        # set query dictionary
        query = {}
        if companyId:
            query = {'companyId': companyId}
        if modellingUnits:
            query['modellingUnitId'] = {'$in': modellingUnits}

        # set projection dictionary (1 means the field is returned, 0 means it is excluded)
        projection = {
            '_id': 0,
            '_updated': 0,
            '_created': 0
        }

        # setting variables for readability
        collection = self.config['mongodb']['modelling_units_collection']

        self.logger.debug('Querying for modelling units in MongoDB: %s' % query)
        cursor = self.mongo[collection].find(query, projection)

        device_key = {}
        stations = {}
        for item in cursor:
            if len(item['devices']) > 0:  # to avoid empty list of devices
                for dev in item['devices']:
                    stations[str(dev['deviceId'].encode('utf-8'))] = str(
                        item['stationId']) if 'stationId' in item else None
                    model = str(item['baseline']['model']) if 'baseline' in item and 'model' in item[
                        'baseline'] else 'Weekly30Min'
                    if str(dev['deviceId'].encode('utf-8')) in device_key.keys():
                        device_key[str(dev['deviceId'].encode('utf-8'))].append(
                            str(item['modellingUnitId']) + '~' + str(item['devices']) + '~' + model)
                    else:
                        device_key[str(dev['deviceId'].encode('utf-8'))] = [
                            str(item['modellingUnitId']) + '~' + str(item['devices']) + '~' + model]

        self.logger.info('A mongo query process has loaded {} devices'.format(len(device_key.keys())))

        ######################################################################################################################################################################################
        """ HIVE QUERY TO PREPARE DATA THAT HAS TO BE LOADED INTO MONGO """
        ######################################################################################################################################################################################
        # In the previous implementation, only energyType or companyId was taken into account when joining the different tables.
        # In this proposal, all tables for every companyId and energyType are joined to form the input of the baseline module.

        # If energyType is a single element (not a list), wrap it in a list; the energy type is also added to each row later on.
        if not isinstance(energyType, list):
            energyType = [energyType]
        # All companyIds to join, including the requesting company itself
        companyId_toJoin.append(companyId)

        tables = []
        energyTypeList = []
        # Create temp tables with hbase data, add them to context_clean to be deleted after execution
        for i in range(len(energyType)):
            for j in range(len(companyId_toJoin)):
                try:
                    temp_table = create_measures_temp_table_edinet(self.hive, energyType[i], companyId_toJoin[j], self.task_UUID)
                    tables.append(temp_table)
                    self.context.add_clean_hive_tables(temp_table)
                    energyTypeList.append(energyType[i])
                except Exception as e:
                    self.logger.debug("Error creating temp table: {}".format(e))
        self.logger.debug("Created {} temp tables".format(len(tables)))


        fields = [('deviceId', 'string'), ('ts', 'int'), ('value', 'float'), ('accumulated', 'float'),
                  ('energyType', 'string')]

        location = self.config['paths']['measures']
        input_table = create_hive_module_input_table(self.hive, 'edinet_baseline_input',
                                                     location, fields, self.task_UUID)

        #add input table to be deleted after execution
        self.context.add_clean_hive_tables(input_table)
        qbr = RawQueryBuilder(self.hive, self.logger)
        sentence = """
            INSERT OVERWRITE TABLE {input_table}
            SELECT deviceId, ts, value, accumulated, energyType FROM
            ( """
        letter = ''.join(chr(ord('a') + i) for i in range(len(tables) + 1))
        text = []
        for index, tab in enumerate(tables):
            var = letter[index]
            energy_type = energyTypeList[index]
            text.append(""" SELECT {var}.key.deviceId, {var}.key.ts, {var}.value, {var}.accumulated, '{energy_type}' as energyType FROM {tab} {var}
                              WHERE
                                  {var}.key.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss") AND
                                  {var}.key.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss") AND
                                  {var}.key.deviceId IN {devices}
                              """.format(var=var, energy_type=energy_type, tab=tab,
                                         ts_from="{ts_from}", ts_to="{ts_to}", devices="{devices}"))
        sentence += """UNION
                    """.join(text)
        sentence += """) unionResult """
        vars = {
            'input_table': input_table,
            'ts_to': ts_to,
            'ts_from': ts_from,
            'devices': tuple(device_key.keys()) if len(device_key.keys()) > 1 else "('" + ",".join(
                device_key.keys()) + "')"
        }

        self.logger.debug(sentence.format(**vars))
        qbr.execute_query(sentence.format(**vars))
        ######################################################################################################################################################################################
        """ SETUP MAP REDUCE JOB """
        ######################################################################################################################################################################################

        self.logger.info('Launching MapReduce job')
        try:
            # Launch MapReduce job
            ## Buffered measures to HBase
            self.logger.debug('MRJob Align')
            self.launcher_hadoop_job('align', location, companyId, device_key, stations)
        except Exception as e:
            raise Exception('MRJob ALIGN process job has failed: {}'.format(e))
        self.logger.info('Module EDINET_baseline execution finished...')
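A note on the devices placeholder above: when several device ids are present, the tuple's string representation already reads as a valid Hive IN-list, but a one-element tuple would render with a trailing comma ("('dev-a',)"), so that case is built by hand. A quick illustration of the same formatting:

def render_devices(device_ids):
    # Mirrors the formatting used above: tuple repr for several ids,
    # a hand-built "('id')" for exactly one.
    if len(device_ids) > 1:
        return str(tuple(device_ids))
    return "('" + ",".join(device_ids) + "')"


print(render_devices(["dev-a", "dev-b"]))  # ('dev-a', 'dev-b')
print(render_devices(["dev-a"]))           # ('dev-a')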