    def init(self):
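        # For each state slug: fetch the validated user detail CSVs, split them by district, and publish a zip per state.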
        result_loc = self.data_store_location.joinpath('location')
        for slug in self.states.split(","):
            slug = slug.strip()
            state_result_loc = result_loc.joinpath(slug)
            os.makedirs(state_result_loc, exist_ok=True)
            try:
                get_data_from_blob(state_result_loc.joinpath('validated-user-detail', '{}.csv'.format(slug)), is_private=self.is_private)
            except Exception as e:
                print("validated-user-detail not available for "+slug)
                continue
            try:
                get_data_from_blob(state_result_loc.joinpath('validated-user-detail-state', '{}.csv'.format(slug)), is_private=self.is_private)
            except Exception as e:
                print("validated-user-detail-state not available for "+slug)
            user_df = pd.read_csv(state_result_loc.joinpath('validated-user-detail', '{}.csv'.format(slug)))
            district_group = user_df.groupby('District name')
            os.makedirs(state_result_loc.joinpath('districts'), exist_ok=True)
            for district_name, user_data in district_group:
                user_data.to_csv(state_result_loc.joinpath('districts', district_name.lower()+".csv"), index=False)

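            # Move the state-level detail into the districts folder so it is bundled into the archive.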
            shutil.move(state_result_loc.joinpath('validated-user-detail-state', '{}.csv'.format(slug)),
                state_result_loc.joinpath('districts', 'validated-user-detail-state.csv'))
            shutil.make_archive(str(state_result_loc.joinpath('validated-user-detail', slug)),
                                'zip',
                                str(state_result_loc.joinpath('districts')))
            post_data_to_blob(state_result_loc.joinpath('validated-user-detail', '{}.zip'.format(slug)), is_private=self.is_private)
    def init(self):
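        # Monthly district report: compute unique users for every tenant listed in slug_state_mapping.csv.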
        start_time_sec = int(round(time.time()))

        analysis_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        result_loc = self.data_store_location.joinpath('district_reports')
        result_loc.mkdir(exist_ok=True)
        result_loc.joinpath(analysis_date.strftime("%Y-%m-%d")).mkdir(exist_ok=True)
        self.data_store_location.joinpath('config').mkdir(exist_ok=True)
        get_data_from_blob(result_loc.joinpath('slug_state_mapping.csv'))
        tenant_info = pd.read_csv(result_loc.joinpath('slug_state_mapping.csv'))
        get_data_from_blob(self.data_store_location.joinpath('config', 'diksha_config.json'))
        with open(self.data_store_location.joinpath('config', 'diksha_config.json'), 'r') as f:
            self.config = json.loads(f.read())
        for ind, row in tenant_info.iterrows():
            print(row['state'])
            result_loc.joinpath(row["slug"]).mkdir(exist_ok=True)
            if isinstance(row['state'], str):
                self.unique_users(result_loc_=result_loc.joinpath(row["slug"]), date_=analysis_date,
                             state_=row['state'])

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [
            {
                "metric": "timeTakenSecs",
                "value": time_taken
            },
            {
                "metric": "date",
                "value": analysis_date.strftime("%Y-%m-%d")
            }
        ]
        push_metric_event(metrics, "District Monthly Report")
    def init(self):
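        # Split each state's user_detail.csv by district and upload the zipped districts folder.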
        for slug in self.states.split(","):
            slug = slug.strip()
            try:
                get_data_from_blob(
                    self.data_store_location.joinpath('location', slug,
                                                      'user_detail.csv'))
            except Exception as e:
                print("user_detail.csv not available for " + slug)
                continue
            user_df = pd.read_csv(
                self.data_store_location.joinpath('location', slug,
                                                  'user_detail.csv'))
            district_group = user_df.groupby('District name')
            os.makedirs(self.data_store_location.joinpath(
                'location', slug, 'districts'),
                        exist_ok=True)
            for district_name, user_data in district_group:
                user_data.to_csv(self.data_store_location.joinpath(
                    'location', slug, 'districts',
                    district_name.lower() + ".csv"),
                                 index=False)

            shutil.make_archive(
                str(
                    self.data_store_location.joinpath('location', slug,
                                                      'districts')), 'zip',
                str(
                    self.data_store_location.joinpath('location', slug,
                                                      'districts')))
            post_data_to_blob(
                self.data_store_location.joinpath('location', slug,
                                                  'districts.zip'))
    def init(self):
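        # Split each state's declared user detail by persona and publish the zipped folder per state.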
        result_loc = self.data_store_location.joinpath('location')
        for slug in self.states.split(","):
            slug = slug.strip()
            state_result_loc = result_loc.joinpath(slug)
            os.makedirs(state_result_loc, exist_ok=True)
            try:
                get_data_from_blob(state_result_loc.joinpath(
                    'declared_user_detail', '{}.csv'.format(slug)),
                                   is_private=self.is_private)
            except Exception as e:
                print("declared_user_detail not available for " + slug)
                continue

            user_df = pd.read_csv(
                state_result_loc.joinpath('declared_user_detail',
                                          '{}.csv'.format(slug)))
            os.makedirs(state_result_loc.joinpath('personas'), exist_ok=True)
            for persona, user_data in user_df.groupby('Persona'):
                user_data.to_csv(state_result_loc.joinpath(
                    'personas',
                    persona.lower() + ".csv"),
                                 index=False)

            shutil.make_archive(
                str(state_result_loc.joinpath('declared_user_detail', slug)),
                'zip', str(state_result_loc.joinpath('personas')))
            post_data_to_blob(state_result_loc.joinpath(
                'declared_user_detail', '{}.zip'.format(slug)),
                              is_private=self.is_private)
 def init(self):
     start_time_sec = int(round(time.time()))
     print("Content Consumption Report::Start")
     self.data_store_location = Path(self.data_store_location)
     org_search = self.org_search
     druid = self.druid_hostname
     druid_rollup = self.druid_rollup_hostname
     content_search = self.content_search
     execution_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
     result_loc = self.data_store_location.joinpath('content_plays')
     result_loc.parent.joinpath('portal_dashboards').mkdir(exist_ok=True)
     result_loc.parent.joinpath('config').mkdir(exist_ok=True)
     get_data_from_blob(
         result_loc.parent.joinpath('config', 'diksha_config.json'))
     with open(result_loc.parent.joinpath('config', 'diksha_config.json'),
               'r') as f:
         self.config = json.loads(f.read())
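      # Snapshot tenant, content model, and textbook-content mappings before building the weekly and overall reports.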
     get_tenant_info(result_loc_=result_loc,
                     org_search_=org_search,
                     date_=execution_date)
     print("Success::Tenant info")
     get_content_model(result_loc_=result_loc,
                       druid_=druid_rollup,
                       date_=execution_date,
                       config_=self.config,
                       version_='v2')
     print("Success::content model snapshot")
     get_tb_content_mapping(result_loc_=result_loc,
                            date_=execution_date,
                            content_search_=content_search)
     print("Success::TB Content Map")
     self.get_weekly_report(result_loc_=result_loc,
                            druid_rollup_=druid_rollup,
                            date_=execution_date,
                            config=self.config)
     print("Success::Weekly Conent Consumption")
     self.get_overall_report(result_loc_=result_loc,
                             druid_rollup_=druid_rollup,
                             date_=execution_date,
                             config=self.config)
     print("Success::Overall Conent Consumption")
     self.get_last_week_report(result_loc_=result_loc,
                               date_=execution_date,
                               num_weeks=6)
     print("Success:::Last 6 weeks")
     shutil.rmtree(result_loc)
     print("Content Consumption Report::Completed")
     end_time_sec = int(round(time.time()))
     time_taken = end_time_sec - start_time_sec
     metrics = [{
         "metric": "timeTakenSecs",
         "value": time_taken
     }, {
         "metric": "date",
         "value": execution_date.strftime("%Y-%m-%d")
     }]
     push_metric_event(metrics, "Content Consumption Metrics")
    def generate_reports(self, from_time, to_time):
        ecg_data = self.get_monitoring_data(from_time, to_time)

        findspark.init()
        spark = SparkSession.builder.appName("ECGLearning").master(
            "local[*]").getOrCreate()
        spark.conf.set('spark.sql.session.timeZone', 'Asia/Kolkata')
        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)

        # Create data frame
        ecg_data_rdd = spark.sparkContext.parallelize(ecg_data)
        schema = StructType([
            StructField('time', IntegerType(), True),
            StructField('tps', StringType(), True)
        ])
        tps_df = spark.createDataFrame(ecg_data_rdd, schema)
        tps_df = tps_df.withColumn("tps", tps_df["tps"].cast("float"))
        tps_df = tps_df.withColumn("tps", F.ceil(tps_df["tps"]))
        tps_df = tps_df.withColumn(
            "time", F.from_unixtime(tps_df["time"], "yyyy/MM/dd HH:mm:ss"))

        # Downloading the current file from blob container
        get_data_from_blob(
            Path(self.write_path).joinpath('public', self.csv_file_name))
        current_blob_df = spark.read.csv(os.path.join(self.write_path,
                                                      'public',
                                                      self.csv_file_name),
                                         header=True)
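        # Align types, merge the new TPS samples with the historical data, and de-duplicate on timestamp.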
        current_blob_df = current_blob_df.withColumn(
            "tps", current_blob_df["tps"].cast("int"))
        current_blob_df = current_blob_df.union(tps_df)
        current_blob_df = current_blob_df.dropDuplicates(["time"])
        current_blob_df = current_blob_df.sort("time")

        # removing the first day's data on 7 days data
        current_blob_df = self.remove_last_day(current_blob_df)

        os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)
        current_blob_df.toPandas().to_csv(os.path.join(self.write_path,
                                                       'public',
                                                       self.csv_file_name),
                                          index=False)
        create_json(
            os.path.join(self.write_path, 'public', self.csv_file_name), True)

        # Uploading updated data to Azure blob container
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.csv_file_name))
        write_data_to_blob(self.write_path,
                           os.path.join('public', self.json_file_name))

        spark.stop()
    def init(self):
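        # Weekly district report: device, play, and QR-scan metrics per district for every tenant.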
        start_time_sec = int(round(time.time()))
        file_path = Path(__file__)
        result_loc = self.data_store_location.joinpath('district_reports')
        result_loc.mkdir(exist_ok=True)
        result_loc.parent.joinpath('config').mkdir(exist_ok=True)
        analysis_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        get_data_from_blob(result_loc.joinpath('slug_state_mapping.csv'))
        get_location_info(result_loc, self.location_search, analysis_date)
        tenant_info = pd.read_csv(
            result_loc.joinpath('slug_state_mapping.csv'))
        self.druid_url = "{}druid/v2/".format(self.druid_hostname)
        self.headers = {'Content-Type': "application/json"}
        result_loc.parent.joinpath('config').mkdir(exist_ok=True)
        get_data_from_blob(
            result_loc.parent.joinpath('config', 'diksha_config.json'))
        with open(result_loc.parent.joinpath('config', 'diksha_config.json'),
                  'r') as f:
            self.config = json.loads(f.read())
        for ind, row in tenant_info.iterrows():
            state = row['state']
            print(state)
            result_loc.joinpath(
                analysis_date.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
            result_loc.joinpath(analysis_date.strftime('%Y-%m-%d'),
                                row['slug']).mkdir(exist_ok=True)
            path = result_loc.joinpath(analysis_date.strftime('%Y-%m-%d'),
                                       row['slug'])
            if isinstance(state, str):
                self.district_devices(result_loc_=path,
                                      date_=analysis_date,
                                      state_=state)
                self.district_plays(result_loc_=path,
                                    date_=analysis_date,
                                    state_=state)
                self.district_scans(result_loc_=path,
                                    date_=analysis_date,
                                    state_=state)
                self.merge_metrics(result_loc_=path, date_=analysis_date)

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [{
            "metric": "timeTakenSecs",
            "value": time_taken
        }, {
            "metric": "date",
            "value": analysis_date.strftime("%Y-%m-%d")
        }]
        push_metric_event(metrics, "District Weekly Report")
 def init(self):
     start_time_sec = int(round(time.time()))
     print("Content Consumption Report::Start")
     self.data_store_location = Path(self.data_store_location)
     org_search = self.org_search
     druid = self.druid_hostname
     cassandra = self.cassandra_host
     keyspace = self.keyspace_prefix + 'content_db'
     execution_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
     result_loc = self.data_store_location.joinpath('content_plays')
     result_loc.parent.joinpath('portal_dashboards').mkdir(exist_ok=True)
     result_loc.parent.joinpath('config').mkdir(exist_ok=True)
     get_data_from_blob(
         result_loc.parent.joinpath('config', 'diksha_config.json'))
     with open(result_loc.parent.joinpath('config', 'diksha_config.json'),
               'r') as f:
         self.config = json.loads(f.read())
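      # Refresh tenant and content metadata, load the last 7 days of plays into Cassandra, then roll up the week.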
     get_tenant_info(result_loc_=result_loc,
                     org_search_=org_search,
                     date_=execution_date)
     get_content_model(result_loc_=result_loc,
                       druid_=druid,
                       date_=execution_date)
     self.define_keyspace(cassandra_=cassandra, keyspace_=keyspace)
     for i in range(7):
         analysis_date = execution_date - timedelta(days=i)
         get_content_plays(result_loc_=result_loc,
                           date_=analysis_date,
                           druid_=druid)
         self.insert_data_to_cassandra(result_loc_=result_loc,
                                       date_=analysis_date,
                                       cassandra_=cassandra,
                                       keyspace_=keyspace)
     self.get_weekly_plays(result_loc_=result_loc,
                           date_=execution_date,
                           cassandra_=cassandra,
                           keyspace_=keyspace)
     print("Content Consumption Report::Completed")
     end_time_sec = int(round(time.time()))
     time_taken = end_time_sec - start_time_sec
     metrics = [{
         "metric": "timeTakenSecs",
         "value": time_taken
     }, {
         "metric": "date",
         "value": execution_date.strftime("%Y-%m-%d")
     }]
     push_metric_event(metrics, "Content Consumption Metrics")
    def init(self):
        start_time_sec = int(round(time.time()))
        print("START:CMO Dashboard")
        data_store_location = self.data_store_location.joinpath('portal_dashboards')
        data_store_location.mkdir(exist_ok=True)
        analysis_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        data_store_location.joinpath('public').mkdir(exist_ok=True)
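        # Refresh the overall CMO dashboard first, then repeat per tenant slug.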
        get_data_from_blob(data_store_location.joinpath('overall', 'daily_metrics.csv'))
        self.data_wrangling(result_loc_=data_store_location.joinpath('overall', 'daily_metrics.csv'), date_=analysis_date)
        create_json(data_store_location.joinpath('public', 'cmo_dashboard.csv'), last_update=True)
        post_data_to_blob(data_store_location.joinpath('public', 'cmo_dashboard.csv'))
        get_tenant_info(result_loc_=data_store_location.parent.joinpath('textbook_reports'), org_search_=self.org_search,
                        date_=analysis_date)
        board_slug = pd.read_csv(
            data_store_location.parent.joinpath('textbook_reports', analysis_date.strftime('%Y-%m-%d'), 'tenant_info.csv'))
        slug_list = board_slug['slug'].unique().tolist()
        for slug in slug_list:
            try:
                get_data_from_blob(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'))
                self.data_wrangling(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'), date_=analysis_date)
                create_json(read_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'), last_update=True)
                post_data_to_blob(result_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'))
            except Exception:
                # tenant dashboards without data in blob storage are skipped
                pass
        print("END:CMO Dashboard")

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [
            {
                "metric": "timeTakenSecs",
                "value": time_taken
            },
            {
                "metric": "date",
                "value": analysis_date.strftime("%Y-%m-%d")
            }
        ]
        push_metric_event(metrics, "CMO Dashboard")
    def generate_report(self):
        board_slug = pd.read_csv(
                        self.data_store_location.joinpath('textbook_reports', self.current_time.strftime('%Y-%m-%d'), 'tenant_info.csv')
                    )[['id', 'slug']]
        board_slug.set_index('slug', inplace=True)
        result = {}

        for slug, value in board_slug.iterrows():
            try:
                print(slug)
                org_path = self.data_store_location.joinpath('portal_dashboards', slug)
                os.makedirs(org_path, exist_ok=True)

                get_data_from_blob(org_path.joinpath('daily_metrics.csv'))
                get_data_from_blob(org_path.joinpath('DCE_textbook_data.csv'))
                get_data_from_blob(org_path.joinpath('content_creation.csv'))
                dm_df = pd.read_csv(org_path.joinpath('daily_metrics.csv'))
                dm_df = dm_df.set_index('Date')
                dm_df.set_index(pd.to_datetime(dm_df.index, format='%d-%m-%Y'), inplace=True)
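                # Aggregate lifetime metrics before and after 2019-06-01 into separate landing-page JSONs.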
                _2018 = dm_df.loc[dm_df.index < '2019-06-01'].sum()[
                    ['Total QR scans', 'Total Content Downloads', 'Total Content Plays', 'Total Content Play Time (in hours)']]
                _2019 = dm_df.loc[dm_df.index >= '2019-06-01'].sum()[
                    ['Total QR scans', 'Total Content Downloads', 'Total Content Plays', 'Total Content Play Time (in hours)']]
                _2018.to_json(org_path.joinpath('landing_page_2018.json'))
                _2019.to_json(org_path.joinpath('landing_page_2019.json'))
                try:
                    dce_df = pd.read_csv(org_path.joinpath('DCE_textbook_data.csv'))
                    cc_df = pd.read_csv(org_path.joinpath('content_creation.csv'))
                    result = {
                        'no_of_textbooks': dce_df.shape[0],
                        'no_of_qr_codes': int(dce_df['Total number of QR codes'].sum()),
                        'no_of_resource': cc_df[cc_df['Status'] == 'live']['Status from the beginning'].values.tolist()[0]
                    }
                except:
                    result = {
                        'no_of_textbooks': 0,
                        'no_of_qr_codes': 0,
                        'no_of_resource': 0
                    }
                with open(str(org_path.joinpath('landing_page_creation_metrics.json')), 'w') as f:
                    json.dump(result, f)

                post_data_to_blob(org_path.joinpath('landing_page_2018.json'))
                post_data_to_blob(org_path.joinpath('landing_page_2019.json'))
                post_data_to_blob(org_path.joinpath('landing_page_creation_metrics.json'))
            except EmptyDataError:
                pass
            except AzureMissingResourceHttpError:
                pass
 def merge_metrics(self, result_loc_, date_):
     """
     merge all the metrics
     :param result_loc_: pathlib.Path object to store resultant CSV at.
     :param date_: datetime object to be used in path
     :return: None
     """
     slug_ = result_loc_.name
     result_loc_.parent.parent.parent.joinpath("portal_dashboards").mkdir(
         exist_ok=True)
     last_sunday = datetime.strftime(date_ - timedelta(days=1), '%d/%m/%Y')
     try:
         devices_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_unique_devices.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         devices_df = pd.DataFrame([],
                                   columns=[
                                       'District', 'Platform',
                                       'Unique Devices'
                                   ]).set_index(['District', 'Platform'])
     try:
         plays_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_content_plays.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         plays_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of Content Plays'
                                 ]).set_index(['District', 'Platform'])
     try:
         scans_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_qr_scans.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         scans_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of QR Scans'
                                 ]).set_index(['District', 'Platform'])
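      # Outer-join devices, scans, and plays, pivot by Platform, and add per-metric totals across platforms.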
     district_df = devices_df.join(scans_df, how='outer').join(
         plays_df, how='outer').reset_index().pivot(index='District',
                                                    columns='Platform')
     district_df = district_df.join(district_df.sum(level=0, axis=1))
     district_df.columns = [
         col[0] + ' on ' +
         col[1].split('.')[-1] if isinstance(col, tuple) else 'Total ' + col
         for col in district_df.columns
     ]
     district_df['Data as on Last day (Sunday) of the week'] = last_sunday
     district_df = district_df.reset_index()
     district_df.index = [
         pd.to_datetime(
             district_df['Data as on Last day (Sunday) of the week'],
             format='%d/%m/%Y'), district_df['District']
     ]
     for c in [
             'Unique Devices on portal', 'Unique Devices on app',
             'Total Unique Devices', 'Number of QR Scans on portal',
             'Number of QR Scans on app', 'Total Number of QR Scans',
             'Number of Content Plays on portal',
             'Number of Content Plays on app',
             'Total Number of Content Plays'
     ]:
         if c not in district_df.columns:
             district_df[c] = 0
     try:
         get_data_from_blob(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = pd.read_csv(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = blob_data[
             blob_data['Data as on Last day (Sunday) of the week'] !=
             last_sunday]
         blob_data.index = [
             pd.to_datetime(
                 blob_data['Data as on Last day (Sunday) of the week'],
                 format='%d/%m/%Y'), blob_data['District']
         ]
     except AzureMissingResourceHttpError:
         blob_data = pd.DataFrame()
     except FileNotFoundError:
         blob_data = pd.DataFrame()
     district_df = pd.concat([blob_data, district_df], sort=True)
     district_df = district_df.sort_index().drop_duplicates(
         subset=['Data as on Last day (Sunday) of the week', 'District'],
         keep='last').fillna(0)
     district_df = district_df[[
         'Data as on Last day (Sunday) of the week', 'District',
         'Unique Devices on app', 'Unique Devices on portal',
         'Total Unique Devices', 'Number of QR Scans on app',
         'Number of QR Scans on portal', 'Total Number of QR Scans',
         'Number of Content Plays on app',
         'Number of Content Plays on portal',
         'Total Number of Content Plays'
     ]]
     district_df.to_csv(result_loc_.parent.parent.parent.joinpath(
         "portal_dashboards", slug_, "aggregated_district_data.csv"),
                        index=False)
     create_json(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
     post_data_to_blob(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
    def init(self):
        start_time_sec = int(round(time.time()))
        start_time = datetime.now()
        print("Started at: ", start_time.strftime('%Y-%m-%d %H:%M:%S'))
        findspark.init()
        execution_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        analysis_date = execution_date - timedelta(1)

        self.data_store_location.joinpath('tb_metadata').mkdir(exist_ok=True)
        self.data_store_location.joinpath('play').mkdir(exist_ok=True)
        self.data_store_location.joinpath('downloads').mkdir(exist_ok=True)
        self.data_store_location.joinpath('dialcode_scans').mkdir(
            exist_ok=True)
        self.data_store_location.joinpath('portal_dashboards').mkdir(
            exist_ok=True)
        self.data_store_location.joinpath('config').mkdir(exist_ok=True)
        get_data_from_blob(
            self.data_store_location.joinpath('config', 'diksha_config.json'))
        with open(
                self.data_store_location.joinpath('config',
                                                  'diksha_config.json'),
                'r') as f:
            self.config = json.loads(f.read())
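        # Build each intermediate dataset for the previous day, then merge everything in daily_metrics.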
        get_textbook_snapshot(
            result_loc_=self.data_store_location.joinpath('tb_metadata'),
            content_search_=self.content_search,
            content_hierarchy_=self.content_hierarchy,
            date_=analysis_date)
        print('[Success] Textbook Snapshot')
        get_tenant_info(
            result_loc_=self.data_store_location.joinpath('textbook_reports'),
            org_search_=self.org_search,
            date_=analysis_date)
        print('[Success] Tenant Info')
        self.app_and_plays(
            result_loc_=self.data_store_location.joinpath('play'),
            date_=analysis_date)
        print('[Success] App and Plays')
        self.dialscans(
            result_loc_=self.data_store_location.joinpath('dialcode_scans'),
            date_=analysis_date)
        print('[Success] DIAL Scans')
        self.downloads(
            result_loc_=self.data_store_location.joinpath('downloads'),
            date_=analysis_date)
        print('[Success] Downloads')
        self.daily_metrics(read_loc_=self.data_store_location,
                           date_=analysis_date)
        print('[Success] Daily metrics')
        end_time = datetime.now()
        print("Ended at: ", end_time.strftime('%Y-%m-%d %H:%M:%S'))
        print("Time taken: ", str(end_time - start_time))

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [{
            "metric": "timeTakenSecs",
            "value": time_taken
        }, {
            "metric": "date",
            "value": execution_date.strftime("%Y-%m-%d")
        }]
        push_metric_event(metrics, "Consumption Metrics")
    def daily_metrics(self, read_loc_, date_):
        """
        merge the three metrics
        :param read_loc_: pathlib.Path object to read CSV from.
        :param date_: datetime object to use in path
        :return: None
        """
        try:
            board_slug = \
                pd.read_csv(
                    self.data_store_location.joinpath('textbook_reports', date_.strftime('%Y-%m-%d'), 'tenant_info.csv'))[
                    ['id', 'slug']]
            board_slug.set_index('id', inplace=True)
        except Exception:
            raise Exception('Board Slug Error!')
        try:
            scans_df = pd.read_csv(
                read_loc_.joinpath('dialcode_scans',
                                   date_.strftime('%Y-%m-%d'),
                                   'dial_scans.csv')).fillna('')
            scans_df = scans_df.pivot(index='dialcode_channel',
                                      columns='failed_flag',
                                      values='count').reset_index().fillna(0)
            scans_df = scans_df.join(
                board_slug, on='dialcode_channel',
                how='left')[['slug', 'Failed QR Scans', 'Successful QR Scans']]
            scans_df['Total QR scans'] = scans_df[
                'Successful QR Scans'] + scans_df['Failed QR Scans']
            scans_df['Percentage (%) of Failed QR Scans'] = scans_df[
                'Failed QR Scans'] * 100 / scans_df['Total QR scans']
            unmapped = scans_df[scans_df.slug.isna()]['Total QR scans'][0]
            scans_df.dropna(subset=['slug'], inplace=True)
        except Exception as e:
            raise Exception('Scans Error! :: {}'.format(str(e)))
        try:
            downloads_df = pd.read_csv(
                read_loc_.joinpath('downloads', date_.strftime('%Y-%m-%d'),
                                   'downloads.csv'))
            downloads_df = downloads_df.fillna('').join(
                board_slug, on='channel',
                how='left')[['count', 'slug']].dropna(subset=['slug'])
            downloads_df.columns = ['Total Content Downloads', 'slug']
        except Exception:
            raise Exception('Downloads Error!')
        try:
            app_df = pd.read_csv(
                read_loc_.joinpath('play', date_.strftime('%Y-%m-%d'),
                                   'app_sessions.csv'))
            app_df = app_df[[
                'Total App Sessions', 'Total Devices on App',
                'Total Time on App (in hours)'
            ]]
            plays_df = pd.read_csv(read_loc_.joinpath(
                'play', date_.strftime('%Y-%m-%d'), 'plays.csv'),
                                   header=[0, 1],
                                   dtype={0: str})

            # Making the channel column as index with string type since the csv is in multiindex format
            plays_df.set_index(plays_df.columns[0], inplace=True)
            plays_df.index.names = ['channel']
            plays_df = plays_df[1:]

            plays_df = plays_df.reset_index().join(board_slug,
                                                   on='channel',
                                                   how='left')
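            # Pull per-platform columns out of the MultiIndex, defaulting to an empty series when a platform has no data.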
            plays_df['Total Content Plays on App'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Total Content Plays',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Total Content Plays on Portal'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Total Content Plays',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df[
                'Total Devices that played content on App'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['app']),
                    pd.Series(
                        index=plays_df.index,
                        name=('Total Devices that played content',
                              self.config['context']['pdata']['id']['app'])))
            plays_df[
                'Total Devices that played content on Portal'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['portal']),
                    pd.Series(
                        index=plays_df.index,
                        name=(
                            'Total Devices that played content',
                            self.config['context']['pdata']['id']['portal'])))
            plays_df['Content Play Time on App (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Content Play Time (in hours)',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Content Play Time on Portal (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Content Play Time (in hours)',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df = plays_df[[
                'Total Content Plays on App', 'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)', 'slug'
            ]].dropna(subset=['slug'])
        except Exception as e:
            raise Exception('App and Plays Error! :: {}'.format(str(e)))
        try:
            daily_metrics_df = scans_df.join(
                downloads_df.set_index('slug'), on='slug',
                how='outer').reset_index(drop=True).join(
                    plays_df.set_index('slug'),
                    on='slug',
                    how='outer',
                    rsuffix='_plays').fillna(0)
            daily_metrics_df['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
        except Exception:
            raise Exception('Daily Metrics Error!')
        try:
            overall = daily_metrics_df[[
                'Successful QR Scans', 'Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)'
            ]].sum().astype(int)
            overall['Total App Sessions'] = app_df['Total App Sessions'].loc[0]
            overall['Total Devices on App'] = app_df[
                'Total Devices on App'].loc[0]
            overall['Total Time on App (in hours)'] = app_df[
                'Total Time on App (in hours)'].loc[0]
            overall['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
            overall['Unmapped QR Scans'] = unmapped
            overall[
                'Total QR scans'] = overall['Successful QR Scans'] + overall[
                    'Failed QR Scans'] + overall['Unmapped QR Scans']
            overall['Percentage (%) of Failed QR Scans'] = '%.2f' % (
                overall['Failed QR Scans'] * 100 / overall['Total QR scans'])
            overall['Percentage (%) of Unmapped QR Scans'] = '%.2f' % (
                overall['Unmapped QR Scans'] * 100 / overall['Total QR scans'])
            overall['Total Content Plays'] = overall[
                'Total Content Plays on App'] + overall[
                    'Total Content Plays on Portal']
            overall['Total Devices that played content'] = overall[
                'Total Devices that played content on App'] + overall[
                    'Total Devices that played content on Portal']
            overall['Total Content Play Time (in hours)'] = overall[
                'Content Play Time on App (in hours)'] + overall[
                    'Content Play Time on Portal (in hours)']
            overall = overall[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            read_loc_.joinpath('portal_dashboards',
                               'overall').mkdir(exist_ok=True)
            read_loc_.joinpath('portal_dashboards',
                               'mhrd').mkdir(exist_ok=True)
            try:
                get_data_from_blob(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
                blob_data = pd.read_csv(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
            except Exception:
                blob_data = pd.DataFrame()
            blob_data = blob_data.append(pd.DataFrame(overall).transpose(),
                                         sort=False).fillna('')
            blob_data.index = pd.to_datetime(blob_data.Date, format='%d-%m-%Y')
            blob_data.drop_duplicates('Date', inplace=True, keep='last')
            blob_data.sort_index(inplace=True)
            # can remove after first run
            blob_data = blob_data[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            blob_data.to_csv(read_loc_.joinpath('portal_dashboards', 'overall',
                                                'daily_metrics.csv'),
                             index=False)
            create_json(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
            post_data_to_blob(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
        except Exception:
            raise Exception('Overall Metrics Error!')
        try:
            daily_metrics_df['Total Content Plays'] = daily_metrics_df[
                'Total Content Plays on App'] + daily_metrics_df[
                    'Total Content Plays on Portal']
            daily_metrics_df['Total Devices that played content'] = daily_metrics_df[
                                                                        'Total Devices that played content on App'] + \
                                                                    daily_metrics_df[
                                                                        'Total Devices that played content on Portal']
            daily_metrics_df['Total Content Play Time (in hours)'] = daily_metrics_df[
                                                                         'Content Play Time on App (in hours)'] + \
                                                                     daily_metrics_df[
                                                                         'Content Play Time on Portal (in hours)']
            daily_metrics_df.set_index(['slug'], inplace=True)
            daily_metrics_df = daily_metrics_df[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            for slug, value in daily_metrics_df.iterrows():
                if slug != '':
                    read_loc_.joinpath('portal_dashboards',
                                       slug).mkdir(exist_ok=True)
                    for key, val in value.items():
                        if key not in [
                                'Date', 'Percentage (%) of Failed QR Scans'
                        ]:
                            value[key] = int(val)
                        elif key == 'Percentage (%) of Failed QR Scans':
                            value[key] = '%.2f' % val
                    try:
                        get_data_from_blob(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                        blob_data = pd.read_csv(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                    except Exception:
                        blob_data = pd.DataFrame()
                    blob_data = blob_data.append(
                        pd.DataFrame(value).transpose(), sort=False).fillna('')
                    blob_data.index = pd.to_datetime(blob_data.Date,
                                                     format='%d-%m-%Y')
                    blob_data.drop_duplicates('Date',
                                              inplace=True,
                                              keep='last')
                    blob_data.sort_index(inplace=True)
                    # can remove after first run
                    blob_data = blob_data[[
                        'Date', 'Total QR scans', 'Successful QR Scans',
                        'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                        'Total Content Downloads',
                        'Total Content Plays on App',
                        'Total Devices that played content on App',
                        'Content Play Time on App (in hours)',
                        'Total Content Plays on Portal',
                        'Total Devices that played content on Portal',
                        'Content Play Time on Portal (in hours)',
                        'Total Content Plays',
                        'Total Devices that played content',
                        'Total Content Play Time (in hours)'
                    ]]
                    blob_data.to_csv(read_loc_.joinpath(
                        'portal_dashboards', slug, 'daily_metrics.csv'),
                                     index=False)
                    create_json(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
                    post_data_to_blob(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
        except Exception:
            raise Exception('State Metrics Error!')
    def generate_report(self):
        execution_date_str = datetime.strptime(self.execution_date,
                                               "%d/%m/%Y").strftime('%Y-%m-%d')
        week_last_date = (datetime.strptime(self.execution_date, "%d/%m/%Y") -
                          timedelta(1)).strftime('%d/%m/%Y')

        board_slug = pd.read_csv(
            self.data_store_location.joinpath(
                'textbook_reports', execution_date_str,
                'tenant_info.csv'))[['id', 'slug']]
        board_slug.set_index('slug', inplace=True)

        scans_df = pd.read_csv(
            self.data_store_location.joinpath('textbook_reports',
                                              execution_date_str,
                                              'weekly_dialcode_counts.csv'))
        scans_df["edata_filters_dialcodes"] = scans_df[
            "edata_filters_dialcodes"].str.upper().str.strip()
        scans_df = scans_df.groupby("edata_filters_dialcodes").agg({
            "Total Scans":
            "sum"
        }).reset_index()

        tb_dial_df = pd.read_csv(
            self.data_store_location.joinpath('tb_metadata',
                                              execution_date_str,
                                              'qr_code_state.csv'))
        tb_dial_df["QR"] = tb_dial_df["QR"].str.upper().str.strip()

        tb_dial_scans_df = pd.merge(scans_df,
                                    tb_dial_df,
                                    left_on="edata_filters_dialcodes",
                                    right_on="QR")
        tb_dial_scans_df['Index'] = tb_dial_scans_df['Index'].str.split(
            '.').str[0].astype(int)

        tb_dial_scans_df['weighted_scans'] = tb_dial_scans_df[
            'Index'] * tb_dial_scans_df['Total Scans']

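        # Scan-weighted average of the QR code's textbook index per channel.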
        weighted_avg_df = tb_dial_scans_df.groupby("channel").agg({
            "Total Scans":
            "sum",
            "weighted_scans":
            "sum"
        })

        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_scans'] / weighted_avg_df['Total Scans']
        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_average'].round(1)
        weighted_avg_df = weighted_avg_df.reset_index()[[
            'channel', 'weighted_average'
        ]]
        weighted_avg_df.rename(columns={"weighted_average": "Index"},
                               inplace=True)
        weighted_avg_df['Date'] = week_last_date

        for slug, board_value in board_slug.iterrows():
            print(slug)
            try:
                get_data_from_blob(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
                blob_data = pd.read_csv(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
            except Exception:
                blob_data = pd.DataFrame(columns=["Date", "Index"])

            current_channel_df = weighted_avg_df[weighted_avg_df['channel'] ==
                                                 board_value.id][[
                                                     "Date", "Index"
                                                 ]]

            blob_data = pd.concat([blob_data, current_channel_df])
            blob_data.drop_duplicates(subset=['Date'],
                                      keep='last',
                                      inplace=True)
            blob_data.to_csv(self.data_store_location.joinpath(
                'portal_dashboards', slug, 'gps_learning.csv'),
                             index=False)
            create_json(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
            post_data_to_blob(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
 def get_weekly_plays(self, result_loc_, date_, cassandra_, keyspace_):
     """
     query cassandra table for 1 week of content play and timespent.
     :param result_loc_: local path to store resultant csv
     :param date_: datetime object to pass to file path
     :param cassandra_: ip of the cassandra cluster
     :param keyspace_: keyspace in which we are working
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     cluster = Cluster([cassandra_])
     session = cluster.connect()
     start_date = date_ - timedelta(days=7)
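      # CQL template for one week of per-content play and timespent aggregates.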
     fetch_query = Template("""
     SELECT content_id, period, pdata_id, metric FROM $keyspace.content_aggregates WHERE 
     period >= $start_date AND 
     period < $end_date
     ALLOW FILTERING
     """)
     result = session.execute(
         fetch_query.substitute(keyspace=keyspace_,
                                start_date=start_date.strftime('%Y%m%d'),
                                end_date=date_.strftime('%Y%m%d')))
     df_dict = {}
     for row in result:
         if row.content_id in df_dict.keys():
             pass
         else:
             df_dict[row.content_id] = {
                 'identifier': row.content_id,
                 'Number of Plays on App': 0,
                 'Number of Plays on Portal': 0,
                 'Timespent on App': 0,
                 'Timespent on Portal': 0
             }
         pdata_id = 'App' if row.pdata_id == self.config['context']['pdata']['id']['app'] else 'Portal' if \
             row.pdata_id == self.config['context']['pdata']['id']['portal'] else 'error'
         df_dict[row.content_id]['Number of Plays on ' +
                                 pdata_id] += row.metric['plays']
         df_dict[row.content_id]['Timespent on ' +
                                 pdata_id] = row.metric['timespent']
     temp = []
     for k, v in df_dict.items():
         temp.append(v)
     df = pd.DataFrame(temp)
     df['Total No of Plays (App and Portal)'] = df[
         'Number of Plays on App'] + df['Number of Plays on Portal']
     df['Average Play Time in mins on App'] = round(
         df['Timespent on App'] / (60 * df['Number of Plays on App']), 2)
     df['Average Play Time in mins on Portal'] = round(
         df['Timespent on Portal'] / (60 * df['Number of Plays on Portal']),
         2)
     df['Average Play Time in mins (On App and Portal)'] = round(
         (df['Timespent on App'] + df['Timespent on Portal']) /
         (60 * df['Total No of Plays (App and Portal)']), 2)
     df = df[[
         'identifier', 'Total No of Plays (App and Portal)',
         'Number of Plays on App', 'Number of Plays on Portal',
         'Average Play Time in mins (On App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal'
     ]]
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))[[
                                  'channel', 'board', 'medium',
                                  'gradeLevel', 'subject', 'identifier',
                                  'name', 'mimeType', 'createdOn',
                                  'creator', 'lastPublishedOn',
                                  'me_averageRating'
                              ]]
     content_model["creator"] = content_model["creator"].str.replace(
         "null", "")
     content_model['channel'] = content_model['channel'].astype(str)
     content_model['mimeType'] = content_model['mimeType'].apply(
         self.mime_type)
     content_model.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Average Rating(out of 5)'
     ]
     content_model['Content ID'] = content_model['Content ID'].str.replace(
         ".img", "")
     content_model['Created On'] = content_model['Created On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     content_model['Last Published On'] = content_model[
         'Last Published On'].fillna('T').apply(
             lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     # content_model['Last Updated On'] = content_model['Last Updated On'].fillna('T').apply(
     #     lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     df = content_model.join(df.set_index('identifier'),
                             on='Content ID',
                             how='left')
     df['Last Date of the week'] = (date_ -
                                    timedelta(days=1)).strftime('%d-%m-%Y')
     df['Total No of Plays (App and Portal)'] = df[
         'Total No of Plays (App and Portal)'].fillna(0)
     df['Number of Plays on App'] = df['Number of Plays on App'].fillna(0)
     df['Number of Plays on Portal'] = df[
         'Number of Plays on Portal'].fillna(0)
     df['Average Play Time in mins (On App and Portal)'] = df[
         'Average Play Time in mins (On App and Portal)'].fillna(0)
     df['Average Play Time in mins on App'] = df[
         'Average Play Time in mins on App'].fillna(0)
     df['Average Play Time in mins on Portal'] = df[
         'Average Play Time in mins on Portal'].fillna(0)
     df = df.fillna('Unknown')
     df.sort_values(inplace=True,
                    ascending=[1, 1, 1, 1, 1, 0],
                    by=[
                        'channel', 'Board', 'Medium', 'Grade', 'Subject',
                        'Total No of Plays (App and Portal)'
                    ])
     df.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                    'weekly_plays.csv'),
               index=False)
     post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                            'weekly_plays.csv'),
                       backup=True)
     for channel in df.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
             print(slug)
         except KeyError:
             continue
         content_aggregates = df[df['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
         try:
             get_data_from_blob(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
             blob_data = pd.read_csv(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
         except AzureMissingResourceHttpError:
             blob_data = pd.DataFrame()
         except FileNotFoundError:
             blob_data = pd.DataFrame()
         content_aggregates = content_aggregates.append(
             blob_data).drop_duplicates(
                 subset=['Content ID', 'Last Date of the week'],
                 keep='first')
         content_aggregates = content_aggregates[[
             'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
             'Content Name', 'Mime Type', 'Created On',
             'Creator (User Name)', 'Last Published On',
             'Total No of Plays (App and Portal)', 'Number of Plays on App',
             'Number of Plays on Portal',
             'Average Play Time in mins (On App and Portal)',
             'Average Play Time in mins on App',
             'Average Play Time in mins on Portal',
             'Average Rating(out of 5)', 'Last Date of the week'
         ]]
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_aggregates.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))