def init(self):
        for slug in self.states.split(","):
            slug = slug.strip()
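            # Fetch the state's user_detail.csv from blob storage; skip the state if it is not available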
            try:
                get_data_from_blob(
                    self.data_store_location.joinpath('location', slug,
                                                      'user_detail.csv'))
            except Exception as e:
                print("user_detail.csv not available for " + slug)
                continue
            user_df = pd.read_csv(
                self.data_store_location.joinpath('location', slug,
                                                  'user_detail.csv'))
            os.makedirs(self.data_store_location.joinpath(
                'location', slug, 'districts'),
                        exist_ok=True)
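            # Write one CSV per district, then zip the districts folder and upload it to blob storage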
            for district_name, user_data in user_df.groupby('District name'):
                user_data.to_csv(self.data_store_location.joinpath(
                    'location', slug, 'districts',
                    district_name.lower() + ".csv"),
                                 index=False)

            shutil.make_archive(
                str(
                    self.data_store_location.joinpath('location', slug,
                                                      'districts')), 'zip',
                str(
                    self.data_store_location.joinpath('location', slug,
                                                      'districts')))
            post_data_to_blob(
                self.data_store_location.joinpath('location', slug,
                                                  'districts.zip'))
    def init(self):
        result_loc = self.data_store_location.joinpath('location')
        for slug in self.states.split(","):
            slug = slug.strip()
            state_result_loc = result_loc.joinpath(slug)
            os.makedirs(state_result_loc, exist_ok=True)
            try:
                get_data_from_blob(state_result_loc.joinpath(
                    'declared_user_detail', '{}.csv'.format(slug)),
                                   is_private=self.is_private)
            except Exception as e:
                print("declared_user_detail not available for " + slug)
                continue

            user_df = pd.read_csv(
                state_result_loc.joinpath('declared_user_detail',
                                          '{}.csv'.format(slug)))
            os.makedirs(state_result_loc.joinpath('personas'), exist_ok=True)
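            # Split the declared user details into one CSV per persona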
            for persona, user_data in user_df.groupby('Persona'):
                user_data.to_csv(state_result_loc.joinpath(
                    'personas',
                    persona.lower() + ".csv"),
                                 index=False)

            shutil.make_archive(
                str(state_result_loc.joinpath('declared_user_detail', slug)),
                'zip', str(state_result_loc.joinpath('personas')))
            post_data_to_blob(state_result_loc.joinpath(
                'declared_user_detail', '{}.zip'.format(slug)),
                              is_private=self.is_private)
    def init(self):
        result_loc = self.data_store_location.joinpath('location')
        for slug in self.states.split(","):
            slug = slug.strip()
            state_result_loc = result_loc.joinpath(slug)
            os.makedirs(state_result_loc, exist_ok=True)
            try:
                get_data_from_blob(state_result_loc.joinpath('validated-user-detail', '{}.csv'.format(slug)), is_private=self.is_private)
            except Exception as e:
                print("validated-user-detail not available for "+slug)
                continue
            try:
                get_data_from_blob(state_result_loc.joinpath('validated-user-detail-state', '{}.csv'.format(slug)), is_private=self.is_private)
            except Exception as e:
                print("validated-user-detail-state not available for "+slug)
            user_df = pd.read_csv(state_result_loc.joinpath('validated-user-detail', '{}.csv'.format(slug)))
            os.makedirs(state_result_loc.joinpath('districts'), exist_ok=True)
            for district_name, user_data in user_df.groupby('District name'):
                user_data.to_csv(state_result_loc.joinpath('districts', district_name.lower()+".csv"), index=False)

            shutil.move(state_result_loc.joinpath('validated-user-detail-state', '{}.csv'.format(slug)),
                state_result_loc.joinpath('districts', 'validated-user-detail-state.csv'))
            shutil.make_archive(str(state_result_loc.joinpath('validated-user-detail', slug)),
                                'zip',
                                str(state_result_loc.joinpath('districts')))
            post_data_to_blob(state_result_loc.joinpath('validated-user-detail', '{}.zip'.format(slug)), is_private=self.is_private)
 def get_last_week_report(result_loc_, date_, num_weeks):
     """
     fetch the last n weekly reports from blob storage and zip each slug's folder
     :param result_loc_: pathlib.Path() object to store resultant csv at
     :param date_: execution date
     :param num_weeks: number of weeks to fetch report for
     :return: None
     """
     result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                          'content_consumption').mkdir(exist_ok=True)
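     # Download each of the previous num_weeks weekly report folders into content_consumption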
     for i in range(num_weeks):
         last_week = date_ - timedelta(days=((i + 1) * 7))
         result_loc_.joinpath(
             last_week.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
         get_batch_data_from_blob(result_loc_=result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'), 'content_consumption'),
                                  prefix_=result_loc_.name + '/' +
                                  last_week.strftime('%Y-%m-%d'),
                                  backup=True)
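     # Zip each slug's folder and publish the archive under portal_dashboards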
     for slug in result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                      'content_consumption').iterdir():
         if slug.is_dir():
             shutil.make_archive(
                 str(
                     result_loc_.parent.joinpath('portal_dashboards',
                                                 slug.name,
                                                 'content_consumption')),
                 'zip', str(slug))
             post_data_to_blob(
                 result_loc_.parent.joinpath('portal_dashboards', slug.name,
                                             'content_consumption.zip'))
 def unique_users(self, result_loc_, date_, state_):
     """
     Query druid for unique users by district over a month for a state
     :param result_loc_: pathlib.Path object to store resultant CSV
     :param date_: datetime object to pass for query and path
     :param state_: the state to be used in query
     :return: None
     """
     slug_ = result_loc_.name
     year = date_.year
     month = date_.month
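     # Query window runs from the first day of the previous month up to date_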
     if month != 1:
         start_date = datetime(year, month - 1, 1)
     else:
         start_date = datetime(year - 1, 12, 1)
     query = Template(district_devices_monthly.init())
     query = query.substitute(
         app=self.config['context']['pdata']['id']['app'],
         portal=self.config['context']['pdata']['id']['portal'],
         state=state_,
         start_date=start_date.strftime('%Y-%m-%dT00:00:00+00:00'),
         end_date=date_.strftime('%Y-%m-%dT00:00:00+00:00'))
     url = "{}druid/v2/".format(self.druid_hostname)
     headers = {'Content-Type': "application/json"}
     response = requests.request("POST", url, data=query, headers=headers)
     if response.status_code == 200:
         if len(response.json()) == 0:
             return
         data = []
         for record in response.json():
             data.append(record['event'])
         df = pd.DataFrame(data).fillna('Unknown')
         df.to_csv(result_loc_.parent.joinpath(
             date_.strftime("%Y-%m-%d"), "{}_monthly.csv".format(slug_)),
                   index=False)
         post_data_to_blob(result_loc_.parent.joinpath(
             date_.strftime("%Y-%m-%d"), "{}_monthly.csv".format(slug_)),
                           backup=True)
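         # Ensure a 'District' column exists even if Druid returned rows without it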
         df['District'] = df.get('District',
                                 pd.Series(index=df.index, name='District'))
         df['Unique Devices'] = df['Unique Devices'].astype(int)
         df = verify_state_district(
             result_loc_.parent.joinpath(date_.strftime('%Y-%m-%d')),
             state_, df)
         df = df[['District', 'Unique Devices']]
         df = df.groupby('District').sum().reset_index()
         df.to_csv(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"),
             index=False)
         create_json(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"))
         post_data_to_blob(
             result_loc_.joinpath("aggregated_unique_users_summary.csv"))
     else:
         with open(result_loc_.parent.joinpath('error_log.log'), 'a') as f:
             f.write(state_ + ' summary ' + str(response.status_code) +
                     response.text)
    def district_devices(self, result_loc_, date_, state_):
        """
        compute unique devices for a state over a week
        :param result_loc_: pathlib.Path object to store resultant CSV at.
        :param date_: datetime object to use for query and path
        :param state_: state to be used in query
        :return: None
        """
        slug_ = result_loc_.name
        start_date = date_ - timedelta(days=7)
        query = Template(district_devices.init())
        query = query.substitute(
            app=self.config['context']['pdata']['id']['app'],
            portal=self.config['context']['pdata']['id']['portal'],
            state=state_,
            start_date=datetime.strftime(start_date,
                                         '%Y-%m-%dT00:00:00+00:00'),
            end_date=datetime.strftime(date_, '%Y-%m-%dT00:00:00+00:00'))
        response = requests.request("POST",
                                    self.druid_url,
                                    data=query,
                                    headers=self.headers)
        if response.status_code == 200:
            if len(response.json()) == 0:
                return
            data = []
            for record in response.json():
                data.append(record['event'])
            df = pd.DataFrame(data)
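            # Ensure a 'District' column exists even if Druid returned rows without it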
            df['District'] = df.get('District',
                                    pd.Series(index=df.index, name='District'))
            df = verify_state_district(result_loc_.parent, state_, df)
            df = df.fillna('Unknown')
            df = df.groupby(['District', "Platform"]).sum().reset_index()
            df.to_csv(result_loc_.parent.joinpath(
                "{}_district_devices.csv".format(slug_)),
                      index=False)
            post_data_to_blob(result_loc_.parent.joinpath(
                "{}_district_devices.csv".format(slug_)),
                              backup=True)
            df['Unique Devices'] = df['Unique Devices'].astype(int)

            df = df.groupby(['District', 'Platform']).sum().reset_index()

            df = df[['District', 'Platform', 'Unique Devices']]
            df.to_csv(
                result_loc_.joinpath("aggregated_district_unique_devices.csv"),
                index=False)
        else:
            with open(result_loc_.joinpath('error_log.log'), 'a') as f:
                f.write(state_ + 'devices ' + str(response.status_code) +
                        response.text)
            exit(1)
Example #7
 def combine_creation_reports(self, result_loc_, date_):
     """
     Combine weekly and overall numbers.
     :param result_loc_: pathlib.Path object to store the resultant csv at.
     :param date_: datetime object to use in query as well as path
     :return: None
     """
     tenant_name_mapping = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'tenant_info.csv'))
     _start_date = date_ - timedelta(days=7)
     week = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'week.csv'))
     overall = pd.read_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'overall.csv'))
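     # Backfill any missing draft/live/review column with zeros before computing totals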
     while True:
         try:
             week['total'] = week['draft'] + week['live'] + week['review']
             break
         except KeyError as ke:
             week[ke.args[0]] = 0
     while True:
         try:
             overall['total'] = overall['draft'] + overall['live'] + overall['review']
             break
         except KeyError as ke:
             overall[ke.args[0]] = 0
     week = week.set_index('tenant')
     overall = overall.set_index('tenant')
     week_transpose = week.transpose()
     overall_transpose = overall.transpose()
     for ind_, row_ in tenant_name_mapping.iterrows():
         try:
             week_numbers = week_transpose[row_['id']]
             overall_numbers = overall_transpose[row_['id']]
             final_df = pd.concat([week_numbers, overall_numbers], axis=1)
             final_df.index.name = 'Content Status'
             final_df.columns = ['Week starting {}'.format(_start_date.strftime('%d %B')),
                                 'As on {}'.format(date_.strftime('%d %B'))]
             final_df.to_csv(
                 result_loc_.joinpath(date_.strftime('%Y-%m-%d'), '{}_Content_Status.csv'.format(row_['slug'])))
             final_df.index.name = 'Status'
             final_df.columns = ['Status over last week: starting {}'.format(_start_date.strftime('%d %B')),
                                 'Status from the beginning']
             result_loc_.parent.joinpath('portal_dashboards', row_['slug']).mkdir(exist_ok=True)
             final_df.to_csv(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
             create_json(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
             post_data_to_blob(result_loc_.parent.joinpath('portal_dashboards', row_['slug'], 'content_creation.csv'))
         except KeyError as ke:
             print(row_['id'], ke)
    def downloads(self, result_loc_, date_):
        """
        Compute daily content downloads by channel
        :param result_loc_: pathlib.Path object to store resultant CSV at.
        :param date_: datetime object to pass in query and path
        :return: None
        """
        end_date = date_ + timedelta(days=1)
        query = Template(content_downloads.init())
        query = query.substitute(
            app=self.config['context']['pdata']['id']['app'],
            start_date=datetime.strftime(date_, '%Y-%m-%dT00:00:00+00:00'),
            end_date=datetime.strftime(end_date, '%Y-%m-%dT00:00:00+00:00'))

        headers = {'Content-Type': "application/json"}
        url = "{}druid/v2/".format(self.druid_hostname)
        response = requests.request("POST", url, data=query, headers=headers)
        records = [events['event'] for events in response.json()]
        data = pd.DataFrame(records)
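        # Map each downloaded object to its channel using the textbook snapshot metadata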
        content = pd.read_csv(
            str(
                result_loc_.parent.joinpath('tb_metadata',
                                            date_.strftime('%Y-%m-%d'),
                                            'textbook_snapshot.csv')))
        content = content[content['contentType'] == 'Resource']
        content = content[['identifier', 'channel']]
        content.drop_duplicates(inplace=True)
        content.rename(columns={'identifier': 'object_id'}, inplace=True)
        data = data.merge(content, on="object_id", how="left")
        data = data[data['channel'].notnull()]
        download_counts = data.groupby('channel').sum()
        download_counts.reset_index(inplace=True)

        result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
        download_counts.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                                    'downloads.csv'),
                               index=False)
        post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                               'downloads.csv'),
                          backup=True)
    def init(self):
        start_time_sec = int(round(time.time()))
        print("START:CMO Dashboard")
        data_store_location = self.data_store_location.joinpath('portal_dashboards')
        data_store_location.mkdir(exist_ok=True)
        analysis_date = datetime.strptime(self.execution_date, "%d/%m/%Y")
        data_store_location.joinpath('public').mkdir(exist_ok=True)
        get_data_from_blob(data_store_location.joinpath('overall', 'daily_metrics.csv'))
        self.data_wrangling(result_loc_=data_store_location.joinpath('overall', 'daily_metrics.csv'), date_=analysis_date)
        create_json(data_store_location.joinpath('public', 'cmo_dashboard.csv'), last_update=True)
        post_data_to_blob(data_store_location.joinpath('public', 'cmo_dashboard.csv'))
        get_tenant_info(result_loc_=data_store_location.parent.joinpath('textbook_reports'), org_search_=self.org_search,
                        date_=analysis_date)
        board_slug = pd.read_csv(
            data_store_location.parent.joinpath('textbook_reports', analysis_date.strftime('%Y-%m-%d'), 'tenant_info.csv'))
        slug_list = board_slug['slug'].unique().tolist()
        for slug in slug_list:
            try:
                get_data_from_blob(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'))
                self.data_wrangling(result_loc_=data_store_location.joinpath(slug, 'daily_metrics.csv'), date_=analysis_date)
                create_json(read_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'), last_update=True)
                post_data_to_blob(result_loc_=data_store_location.joinpath(slug, 'cmo_dashboard.csv'))
            except Exception:
                pass
        print("END:CMO Dashboard")

        end_time_sec = int(round(time.time()))
        time_taken = end_time_sec - start_time_sec
        metrics = [
            {
                "metric": "timeTakenSecs",
                "value": time_taken
            },
            {
                "metric": "date",
                "value": analysis_date.strftime("%Y-%m-%d")
            }
        ]
        push_metric_event(metrics, "CMO Dashboard")
    def dialscans(self, result_loc_, date_):
        """
        compute failed/successful scans by channel
        :param result_loc_: pathlib.Path object to store resultant CSV at.
        :param date_: datetime object to use in query and path
        :return: None
        """
        end_date = date_ + timedelta(days=1)
        query = Template(dialcode_scans.init())
        query = query.substitute(
            app=self.config['context']['pdata']['id']['app'],
            portal=self.config['context']['pdata']['id']['portal'],
            start_date=datetime.strftime(date_, '%Y-%m-%dT00:00:00+00:00'),
            end_date=datetime.strftime(end_date, '%Y-%m-%dT00:00:00+00:00'))

        headers = {'Content-Type': "application/json"}
        url = "{}druid/v2/".format(self.druid_hostname)
        response = requests.request("POST", url, data=query, headers=headers)
        result = response.json()
        records = [events['event'] for events in result]
        data = pd.DataFrame(records)
        data['dialcode_channel'] = data.get(
            'dialcode_channel',
            pd.Series(index=data.index, name='dialcode_channel'))
        data['dialcode_channel'] = data['dialcode_channel'].fillna("")
        # pd.np was removed in recent pandas; assumes `import numpy as np` at module level
        data['failed_flag'] = np.where(data['edata_size'].astype(int) > 0,
                                       'Successful QR Scans',
                                       'Failed QR Scans')
        df = data.groupby(['dialcode_channel', 'failed_flag']).sum()
        df = df.reset_index()[['dialcode_channel', 'failed_flag', 'count']]
        result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
        df.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                       'dial_scans.csv'),
                  index=False)
        post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                               'dial_scans.csv'),
                          backup=True)
 def dce_aggregates(self, result_loc_, slug, df):
     """
     generate charts from DCE textbook data.
     :param result_loc_: pathlib.Path object with path to store resultant CSVs.
     :param slug: slug name for channel
     :param df: DCE textbook dataframe for the channel
     :return: None
     """
     qr_linked = df[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum()
     qr_linked.index = ['QR Code With Content', 'QR Code Without Content']
     qr_linked = pd.DataFrame(qr_linked).reset_index()
     qr_linked.columns = ['Status', 'Count']
     qr_linked.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                           'dce_qr_content_status.csv'),
                      index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status.csv'))
     qr_linked_by_grade = df.groupby('Grade')[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linked_by_grade.columns = [
         'Grade', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linked_by_grade = self.grade_fix(qr_linked_by_grade)
     qr_linked_by_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'dce_qr_content_status_grade.csv'),
                               index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_grade.csv'))
     qr_linked_by_subject = df.groupby('Subject')[[
         'Number of QR codes with atleast 1 linked content',
         'Number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linked_by_subject.columns = [
         'Subject', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linked_by_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'dce_qr_content_status_subject.csv'),
                                 index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'dce_qr_content_status_subject.csv'))
Example #12
    def generate_report(self):
        board_slug = pd.read_csv(
                        self.data_store_location.joinpath('textbook_reports', self.current_time.strftime('%Y-%m-%d'), 'tenant_info.csv')
                    )[['id', 'slug']]
        board_slug.set_index('slug', inplace=True)
        result = {}

        for slug, value in board_slug.iterrows():
            try:
                print(slug)
                org_path = self.data_store_location.joinpath('portal_dashboards', slug)
                os.makedirs(org_path, exist_ok=True)

                get_data_from_blob(org_path.joinpath('daily_metrics.csv'))
                get_data_from_blob(org_path.joinpath('DCE_textbook_data.csv'))
                get_data_from_blob(org_path.joinpath('content_creation.csv'))
                dm_df = pd.read_csv(org_path.joinpath('daily_metrics.csv'))
                dm_df = dm_df.set_index('Date')
                dm_df.set_index(pd.to_datetime(dm_df.index, format='%d-%m-%Y'), inplace=True)
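                # Split lifetime metrics at 2019-06-01 into the two reporting periods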
                _2018 = dm_df.loc[dm_df.index < '2019-06-01'].sum()[
                    ['Total QR scans', 'Total Content Downloads', 'Total Content Plays', 'Total Content Play Time (in hours)']]
                _2019 = dm_df.loc[dm_df.index >= '2019-06-01'].sum()[
                    ['Total QR scans', 'Total Content Downloads', 'Total Content Plays', 'Total Content Play Time (in hours)']]
                _2018.to_json(org_path.joinpath('landing_page_2018.json'))
                _2019.to_json(org_path.joinpath('landing_page_2019.json'))
                try:
                    dce_df = pd.read_csv(org_path.joinpath('DCE_textbook_data.csv'))
                    cc_df = pd.read_csv(org_path.joinpath('content_creation.csv'))
                    result = {
                        'no_of_textbooks': dce_df.shape[0],
                        'no_of_qr_codes': int(dce_df['Total number of QR codes'].sum()),
                        'no_of_resource': cc_df[cc_df['Status'] == 'live']['Status from the beginning'].values.tolist()[0]
                    }
                except Exception:
                    result = {
                        'no_of_textbooks': 0,
                        'no_of_qr_codes': 0,
                        'no_of_resource': 0
                    }
                with open(str(org_path.joinpath('landing_page_creation_metrics.json')), 'w') as f:
                    json.dump(result, f)

                post_data_to_blob(org_path.joinpath('landing_page_2018.json'))
                post_data_to_blob(org_path.joinpath('landing_page_2019.json'))
                post_data_to_blob(org_path.joinpath('landing_page_creation_metrics.json'))
            except EmptyDataError:
                pass
            except AzureMissingResourceHttpError:
                pass
    def app_and_plays(self, result_loc_, date_):
        """
        Compute app sessions, content play sessions, and time spent on content consumption.
        :param result_loc_: pathlib.Path object to store resultant CSV at.
        :param date_: datetime object to use in query and path
        :return: None
        """
        # Overall app session metrics
        end_date = date_ + timedelta(days=1)
        query = Template(app_sessions_devices.init())
        query = query.substitute(
            app=self.config['context']['pdata']['id']['app'],
            start_date=datetime.strftime(date_, '%Y-%m-%dT00:00:00+00:00'),
            end_date=datetime.strftime(end_date, '%Y-%m-%dT00:00:00+00:00'))
        headers = {'Content-Type': "application/json"}
        url = "{}druid/v2/".format(self.druid_hostname)
        response = requests.request("POST", url, data=query, headers=headers)
        records = [events['event'] for events in response.json()]
        app_df = pd.DataFrame(records)
        app_df['Total Devices on App'] = app_df['Total Devices on App'].astype(
            int)
        app_df['Total Time on App (in hours)'] = app_df[
            'Total Time on App'] / 3600
        app_df.drop(['Total Time on App'], axis=1, inplace=True)
        result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
        app_df.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                           'app_sessions.csv'),
                      index=False)
        post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                               'app_sessions.csv'),
                          backup=True)

        # Content Play and time spent
        query = Template(app_plays.init())
        query = query.substitute(
            app=self.config['context']['pdata']['id']['app'],
            portal=self.config['context']['pdata']['id']['portal'],
            start_date=datetime.strftime(date_, '%Y-%m-%dT00:00:00+00:00'),
            end_date=datetime.strftime(end_date, '%Y-%m-%dT00:00:00+00:00'))
        headers = {'Content-Type': "application/json"}
        url = "{}druid/v2/".format(self.druid_hostname)
        response = requests.request("POST", url, data=query, headers=headers)
        records = [events['event'] for events in response.json()]
        play_df = pd.DataFrame(records)

        content = pd.read_csv(
            str(
                result_loc_.parent.joinpath('tb_metadata',
                                            date_.strftime('%Y-%m-%d'),
                                            'textbook_snapshot.csv')))
        content = content[['identifier', 'channel']]
        content.drop_duplicates(inplace=True)
        content.rename(columns={'identifier': 'object_rollup_l1'},
                       inplace=True)
        play_df = play_df.merge(content, on="object_rollup_l1", how="left")
        play_df.rename(columns={
            'dimensions_pdata_id': 'pdata_id',
            'dimensions_did': 'Total Devices that played content'
        },
                       inplace=True)
        play_df = play_df.groupby(['channel', 'pdata_id']).agg({
            'Total Devices that played content':
            pd.Series.nunique,
            'Total Content Plays':
            pd.Series.sum,
            'Content Play Time':
            pd.Series.sum
        })
        play_df.reset_index(inplace=True)
        play_df['Content Play Time (in hours)'] = play_df[
            'Content Play Time'] / 3600
        play_df.drop(['Content Play Time'], axis=1, inplace=True)
        x_play = play_df.pivot(index='channel', columns='pdata_id')
        x_play.to_csv(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'), 'plays.csv'))
        post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                               'plays.csv'),
                          backup=True)
 def merge_metrics(self, result_loc_, date_):
     """
     merge the weekly district-level device, QR scan and content play metrics into one report
     :param result_loc_: pathlib.Path object to store resultant CSV at.
     :param date_: datetime object to be used in path
     :return: None
     """
     slug_ = result_loc_.name
     result_loc_.parent.parent.parent.joinpath("portal_dashboards").mkdir(
         exist_ok=True)
     last_sunday = datetime.strftime(date_ - timedelta(days=1), '%d/%m/%Y')
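     # Load the weekly device, scan and play aggregates; fall back to empty frames when a file is missing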
     try:
         devices_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_unique_devices.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         devices_df = pd.DataFrame([],
                                   columns=[
                                       'District', 'Platform',
                                       'Unique Devices'
                                   ]).set_index(['District', 'Platform'])
     try:
         plays_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_content_plays.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         plays_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of Content Plays'
                                 ]).set_index(['District', 'Platform'])
     try:
         scans_df = pd.read_csv(
             result_loc_.joinpath(
                 "aggregated_district_qr_scans.csv")).set_index(
                     ['District', 'Platform'])
     except FileNotFoundError:
         scans_df = pd.DataFrame([],
                                 columns=[
                                     'District', 'Platform',
                                     'Number of QR Scans'
                                 ]).set_index(['District', 'Platform'])
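     # Outer-join the three metrics, pivot platforms into column groups, and add cross-platform totals per metric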
     district_df = devices_df.join(scans_df, how='outer').join(
         plays_df, how='outer').reset_index().pivot(index='District',
                                                    columns='Platform')
     district_df = district_df.join(district_df.sum(level=0, axis=1))
     district_df.columns = [
         col[0] + ' on ' +
         col[1].split('.')[-1] if isinstance(col, tuple) else 'Total ' + col
         for col in district_df.columns
     ]
     district_df['Data as on Last day (Sunday) of the week'] = last_sunday
     district_df = district_df.reset_index()
     district_df.index = [
         pd.to_datetime(
             district_df['Data as on Last day (Sunday) of the week'],
             format='%d/%m/%Y'), district_df['District']
     ]
     for c in [
             'Unique Devices on portal', 'Unique Devices on app',
             'Total Unique Devices', 'Number of QR Scans on portal',
             'Number of QR Scans on app', 'Total Number of QR Scans',
             'Number of Content Plays on portal',
             'Number of Content Plays on app',
             'Total Number of Content Plays'
     ]:
         if c not in district_df.columns:
             district_df[c] = 0
     try:
         get_data_from_blob(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = pd.read_csv(
             result_loc_.parent.parent.parent.joinpath(
                 "portal_dashboards", slug_,
                 "aggregated_district_data.csv"))
         blob_data = blob_data[
             blob_data['Data as on Last day (Sunday) of the week'] !=
             last_sunday]
         blob_data.index = [
             pd.to_datetime(
                 blob_data['Data as on Last day (Sunday) of the week'],
                 format='%d/%m/%Y'), blob_data['District']
         ]
     except (AzureMissingResourceHttpError, FileNotFoundError):
         blob_data = pd.DataFrame()
     district_df = pd.concat([blob_data, district_df], sort=True)
     district_df = district_df.sort_index().drop_duplicates(
         subset=['Data as on Last day (Sunday) of the week', 'District'],
         keep='last').fillna(0)
     district_df = district_df[[
         'Data as on Last day (Sunday) of the week', 'District',
         'Unique Devices on app', 'Unique Devices on portal',
         'Total Unique Devices', 'Number of QR Scans on app',
         'Number of QR Scans on portal', 'Total Number of QR Scans',
         'Number of Content Plays on app',
         'Number of Content Plays on portal',
         'Total Number of Content Plays'
     ]]
     district_df.to_csv(result_loc_.parent.parent.parent.joinpath(
         "portal_dashboards", slug_, "aggregated_district_data.csv"),
                        index=False)
     create_json(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
     post_data_to_blob(
         result_loc_.parent.parent.parent.joinpath(
             "portal_dashboards", slug_, "aggregated_district_data.csv"))
 def get_overall_report(result_loc_, druid_rollup_, date_, config):
     """
     Query Druid Rollup monthwise for content and platform level play session counts and time_spent.
     :param result_loc_: pathlib.Path() object to store resultant data at
     :param druid_rollup_: Druid broker ip and port for rollup data
     :param date_: execution date for report
     :param config: diksha configurables
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))
     content_model['channel'] = content_model['channel'].astype(str)
     # content_model['mimeType'] = content_model['mimeType'].apply(mime_type)
     content_model = content_model[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'contentType', 'identifier', 'name', 'creator', 'mimeType',
         'createdOn', 'lastPublishedOn', 'tb_id', 'tb_name',
         'me_totalRatings', 'me_averageRating'
     ]]
     content_model.set_index('identifier', inplace=True)
     result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
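     # Pull rollup play counts one calendar month at a time, from June 2019 up to date_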
     start_date = datetime(2019, 6, 1)
     while start_date < date_:
         if datetime(start_date.year + int(start_date.month / 12),
                     (start_date.month % 12) + 1, 1) < date_:
             end_date = datetime(
                 start_date.year + int(start_date.month / 12),
                 (start_date.month % 12) + 1, 1)
         else:
             end_date = date_
         get_content_plays(result_loc_=result_loc_.joinpath(
             date_.strftime('%Y-%m-%d')),
                           start_date_=start_date,
                           end_date_=end_date,
                           druid_=druid_rollup_,
                           config_=config,
                           version_='v1')
         start_date = end_date
     spark = SparkSession.builder.appName('content_consumption').master(
         "local[*]").getOrCreate()
     content_plays = spark.read.csv(str(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_plays_*.csv')),
                                    header=True)
     content_plays = content_plays.groupby(
         fn.col('object_id'), fn.col('dimensions_pdata_id')).agg(
             fn.sum('Number of plays').alias('Number of plays'),
             fn.sum('Total time spent').alias(
                 'Total time spent')).toPandas()
     spark.stop()
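     # Pivot to one row per content with App/Portal play-count and time-spent columns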
     content_plays = content_plays.pivot(
         index='object_id',
         columns='dimensions_pdata_id',
         values=['Number of plays', 'Total time spent'])
     col_list = []
     for i in content_plays.columns:
         col_list.append(i[0] + ' on ' + i[1].split('.')[-1].title())
     content_plays.columns = col_list
     content_plays.fillna(0, inplace=True)
     content_plays['Total No of Plays (App and Portal)'] = content_plays[
         'Number of plays on App'] + content_plays[
             'Number of plays on Portal']
     content_plays['Average Play Time in mins on App'] = round(
         content_plays['Total time spent on App'] /
         (content_plays['Number of plays on App'] * 60), 2)
     content_plays['Average Play Time in mins on Portal'] = round(
         content_plays['Total time spent on Portal'] /
         (content_plays['Number of plays on Portal'] * 60), 2)
     content_plays['Average Play Time in mins (On App and Portal)'] = round(
         (content_plays['Total time spent on App'] +
          content_plays['Total time spent on Portal']) /
         ((content_plays['Number of plays on App'] +
           content_plays['Number of plays on Portal']) * 60), 2)
     content_plays.drop(
         ['Total time spent on App', 'Total time spent on Portal'],
         axis=1,
         inplace=True)
     overall = content_model.join(content_plays).reset_index()
     overall = overall[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'identifier', 'name', 'mimeType', 'createdOn', 'creator',
         'lastPublishedOn', 'tb_id', 'tb_name', 'me_averageRating',
         'me_totalRatings', 'Number of plays on App',
         'Number of plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]]
     overall.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Linked Textbook Id(s)',
         'Linked Textbook Name(s)', 'Average Rating(out of 5)',
         'Total No of Ratings', 'Number of Plays on App',
         'Number of Plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]
     overall['Content ID'] = overall['Content ID'].str.replace('.img', '')
     overall['Created On'] = overall['Created On'].fillna('T').apply(
         lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     overall['Last Published On'] = overall['Last Published On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     overall.fillna(
         {
             'Board': 'Unknown',
             'Medium': 'Unknown',
             'Grade': 'Unknown',
             'Subject': 'Unknown',
             'Creator (User Name)': '',
             'Linked Textbook Id(s)': '',
             'Linked Textbook Name(s)': '',
             'Number of Plays on App': 0,
             'Number of Plays on Portal': 0,
             'Total No of Plays (App and Portal)': 0,
             'Average Play Time in mins on App': 0,
             'Average Play Time in mins on Portal': 0,
             'Average Play Time in mins (On App and Portal)': 0
         },
         inplace=True)
     overall.sort_values(inplace=True,
                         ascending=[1, 1, 1, 1, 1, 0],
                         by=[
                             'channel', 'Board', 'Medium', 'Grade',
                             'Subject', 'Total No of Plays (App and Portal)'
                         ])
     for channel in overall.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
         except KeyError:
             continue
         content_aggregates = overall[overall['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_aggregated.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregated.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregated.csv'))
Example #16
 def get_weekly_plays(self, result_loc_, date_, cassandra_, keyspace_):
     """
     query cassandra table for 1 week of content play and timespent.
     :param result_loc_: local path to store resultant csv
     :param date_: datetime object to pass to file path
     :param cassandra_: ip of the cassandra cluster
     :param keyspace_: keyspace in which we are working
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     cluster = Cluster([cassandra_])
     session = cluster.connect()
     start_date = date_ - timedelta(days=7)
     fetch_query = Template("""
     SELECT content_id, period, pdata_id, metric FROM $keyspace.content_aggregates WHERE 
     period >= $start_date AND 
     period < $end_date
     ALLOW FILTERING
     """)
     result = session.execute(
         fetch_query.substitute(keyspace=keyspace_,
                                start_date=start_date.strftime('%Y%m%d'),
                                end_date=date_.strftime('%Y%m%d')))
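     # Roll up play counts and time spent per content, split by App and Portal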
     df_dict = {}
     for row in result:
         if row.content_id not in df_dict:
             df_dict[row.content_id] = {
                 'identifier': row.content_id,
                 'Number of Plays on App': 0,
                 'Number of Plays on Portal': 0,
                 'Timespent on App': 0,
                 'Timespent on Portal': 0
             }
         pdata_id = 'App' if row.pdata_id == self.config['context']['pdata']['id']['app'] else 'Portal' if \
             row.pdata_id == self.config['context']['pdata']['id']['portal'] else 'error'
         df_dict[row.content_id]['Number of Plays on ' +
                                 pdata_id] += row.metric['plays']
         df_dict[row.content_id]['Timespent on ' +
                                 pdata_id] += row.metric['timespent']
     df = pd.DataFrame(list(df_dict.values()))
     df['Total No of Plays (App and Portal)'] = df[
         'Number of Plays on App'] + df['Number of Plays on Portal']
     df['Average Play Time in mins on App'] = round(
         df['Timespent on App'] / (60 * df['Number of Plays on App']), 2)
     df['Average Play Time in mins on Portal'] = round(
         df['Timespent on Portal'] / (60 * df['Number of Plays on Portal']),
         2)
     df['Average Play Time in mins (On App and Portal)'] = round(
         (df['Timespent on App'] + df['Timespent on Portal']) /
         (60 * df['Total No of Plays (App and Portal)']), 2)
     df = df[[
         'identifier', 'Total No of Plays (App and Portal)',
         'Number of Plays on App', 'Number of Plays on Portal',
         'Average Play Time in mins (On App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal'
     ]]
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))[[
                                  'channel', 'board', 'medium',
                                  'gradeLevel', 'subject', 'identifier',
                                  'name', 'mimeType', 'createdOn',
                                  'creator', 'lastPublishedOn',
                                  'me_averageRating'
                              ]]
     content_model["creator"] = content_model["creator"].str.replace(
         "null", "")
     content_model['channel'] = content_model['channel'].astype(str)
     content_model['mimeType'] = content_model['mimeType'].apply(
         self.mime_type)
     content_model.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Average Rating(out of 5)'
     ]
     content_model['Content ID'] = content_model['Content ID'].str.replace(
         ".img", "")
     content_model['Created On'] = content_model['Created On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     content_model['Last Published On'] = content_model[
         'Last Published On'].fillna('T').apply(
             lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     # content_model['Last Updated On'] = content_model['Last Updated On'].fillna('T').apply(
     #     lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     df = content_model.join(df.set_index('identifier'),
                             on='Content ID',
                             how='left')
     df['Last Date of the week'] = (date_ -
                                    timedelta(days=1)).strftime('%d-%m-%Y')
     df['Total No of Plays (App and Portal)'] = df[
         'Total No of Plays (App and Portal)'].fillna(0)
     df['Number of Plays on App'] = df['Number of Plays on App'].fillna(0)
     df['Number of Plays on Portal'] = df[
         'Number of Plays on Portal'].fillna(0)
     df['Average Play Time in mins (On App and Portal)'] = df[
         'Average Play Time in mins (On App and Portal)'].fillna(0)
     df['Average Play Time in mins on App'] = df[
         'Average Play Time in mins on App'].fillna(0)
     df['Average Play Time in mins on Portal'] = df[
         'Average Play Time in mins on Portal'].fillna(0)
     df = df.fillna('Unknown')
     df.sort_values(inplace=True,
                    ascending=[1, 1, 1, 1, 1, 0],
                    by=[
                        'channel', 'Board', 'Medium', 'Grade', 'Subject',
                        'Total No of Plays (App and Portal)'
                    ])
     df.to_csv(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                    'weekly_plays.csv'),
               index=False)
     post_data_to_blob(result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                            'weekly_plays.csv'),
                       backup=True)
     for channel in df.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
             print(slug)
         except KeyError:
             continue
         content_aggregates = df[df['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
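         # Append the history already in blob storage and drop duplicate (Content ID, week) rows, preferring this week's data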
         try:
             get_data_from_blob(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
             blob_data = pd.read_csv(
                 result_loc_.parent.joinpath('portal_dashboards', slug,
                                             'content_aggregates.csv'))
         except (AzureMissingResourceHttpError, FileNotFoundError):
             blob_data = pd.DataFrame()
         content_aggregates = content_aggregates.append(
             blob_data).drop_duplicates(
                 subset=['Content ID', 'Last Date of the week'],
                 keep='first')
         content_aggregates = content_aggregates[[
             'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
             'Content Name', 'Mime Type', 'Created On',
             'Creator (User Name)', 'Last Published On',
             'Total No of Plays (App and Portal)', 'Number of Plays on App',
             'Number of Plays on Portal',
             'Average Play Time in mins (On App and Portal)',
             'Average Play Time in mins on App',
             'Average Play Time in mins on Portal',
             'Average Rating(out of 5)', 'Last Date of the week'
         ]]
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_aggregates.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath('portal_dashboards', slug,
                                         'content_aggregates.csv'))
 def get_weekly_report(result_loc_, druid_rollup_, date_, config):
     """
     Query druid rollups for weekly content and platform level play session counts and time_spent.
     :param result_loc_: pathlib.Path() object to store resultant CSVs at
     :param druid_rollup_: druid broker ip and port for rollup data
     :param date_: execution date for report
     :param config: diksha configurables
     :return: None
     """
     tenant_info = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'tenant_info.csv'))[['id', 'slug']]
     tenant_info['id'] = tenant_info['id'].astype(str)
     tenant_info.set_index('id', inplace=True)
     content_model = pd.read_csv(
         result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                              'content_model_snapshot.csv'))
     content_model['channel'] = content_model['channel'].astype(str)
     # content_model['mimeType'] = content_model['mimeType'].apply(mime_type)
     content_model = content_model[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'contentType', 'identifier', 'name', 'creator', 'mimeType',
         'createdOn', 'lastPublishedOn', 'tb_id', 'tb_name',
         'me_totalRatings', 'me_averageRating'
     ]]
     content_model.set_index('identifier', inplace=True)
     result_loc_.joinpath(date_.strftime('%Y-%m-%d')).mkdir(exist_ok=True)
     start_date = date_ - timedelta(days=7)
     get_content_plays(result_loc_=result_loc_.joinpath(
         date_.strftime('%Y-%m-%d')),
                       start_date_=start_date,
                       end_date_=date_,
                       druid_=druid_rollup_,
                       config_=config,
                       version_='v2')
     content_plays = pd.read_csv(
         result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_plays_{}.csv'.format(date_.strftime('%Y-%m-%d'))))
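     # Aggregate plays and time spent per content and platform, then pivot platforms into columns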
     content_plays = content_plays.groupby([
         'object_id', 'dimensions_pdata_id'
     ])[['Number of plays', 'Total time spent']].sum().reset_index()
     content_plays = content_plays.pivot(
         index='object_id',
         columns='dimensions_pdata_id',
         values=['Number of plays', 'Total time spent'])
     col_list = []
     for i in content_plays.columns:
         col_list.append(i[0] + ' on ' + i[1].split('.')[-1].title())
     content_plays.columns = col_list
     content_plays.fillna(0, inplace=True)
     content_plays['Total No of Plays (App and Portal)'] = content_plays[
         'Number of plays on App'] + content_plays[
             'Number of plays on Portal']
     content_plays['Average Play Time in mins on App'] = round(
         content_plays['Total time spent on App'] /
         (content_plays['Number of plays on App'] * 60), 2)
     content_plays['Average Play Time in mins on Portal'] = round(
         content_plays['Total time spent on Portal'] /
         (content_plays['Number of plays on Portal'] * 60), 2)
     content_plays['Average Play Time in mins (On App and Portal)'] = round(
         (content_plays['Total time spent on App'] +
          content_plays['Total time spent on Portal']) /
         ((content_plays['Number of plays on App'] +
           content_plays['Number of plays on Portal']) * 60), 2)
     content_plays.drop(
         ['Total time spent on App', 'Total time spent on Portal'],
         axis=1,
         inplace=True)
     weekly = content_model.join(content_plays).reset_index()
     weekly = weekly[[
         'channel', 'board', 'medium', 'gradeLevel', 'subject',
         'identifier', 'name', 'mimeType', 'createdOn', 'creator',
         'lastPublishedOn', 'tb_id', 'tb_name', 'me_averageRating',
         'me_totalRatings', 'Number of plays on App',
         'Number of plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]]
     weekly.columns = [
         'channel', 'Board', 'Medium', 'Grade', 'Subject', 'Content ID',
         'Content Name', 'Mime Type', 'Created On', 'Creator (User Name)',
         'Last Published On', 'Linked Textbook Id(s)',
         'Linked Textbook Name(s)', 'Average Rating(out of 5)',
         'Total No of Ratings', 'Number of Plays on App',
         'Number of Plays on Portal', 'Total No of Plays (App and Portal)',
         'Average Play Time in mins on App',
         'Average Play Time in mins on Portal',
         'Average Play Time in mins (On App and Portal)'
     ]
     weekly['Content ID'] = weekly['Content ID'].str.replace('.img', '')
     weekly['Created On'] = weekly['Created On'].fillna('T').apply(
         lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     weekly['Last Published On'] = weekly['Last Published On'].fillna(
         'T').apply(lambda x: '-'.join(x.split('T')[0].split('-')[::-1]))
     weekly.fillna(
         {
             'Board': 'Unknown',
             'Medium': 'Unknown',
             'Grade': 'Unknown',
             'Subject': 'Unknown',
             'Creator (User Name)': '',
             'Linked Textbook Id(s)': '',
             'Linked Textbook Name(s)': '',
             'Number of Plays on App': 0,
             'Number of Plays on Portal': 0,
             'Total No of Plays (App and Portal)': 0,
             'Average Play Time in mins on App': 0,
             'Average Play Time in mins on Portal': 0,
             'Average Play Time in mins (On App and Portal)': 0
         },
         inplace=True)
     weekly.sort_values(inplace=True,
                        ascending=[1, 1, 1, 1, 1, 0],
                        by=[
                            'channel', 'Board', 'Medium', 'Grade',
                            'Subject', 'Total No of Plays (App and Portal)'
                        ])
     weekly['Last Date of the week'] = date_.strftime('%d-%m-%Y')
     for channel in weekly.channel.unique():
         try:
             slug = tenant_info.loc[channel]['slug']
         except KeyError:
             continue
         content_aggregates = weekly[weekly['channel'] == channel]
         content_aggregates.drop(['channel'], axis=1, inplace=True)
         result_loc_.parent.joinpath('portal_dashboards',
                                     slug).mkdir(exist_ok=True)
         content_aggregates.to_csv(result_loc_.parent.joinpath(
             'portal_dashboards', slug, 'content_consumption_lastweek.csv'),
                                   index=False,
                                   encoding='utf-8-sig')
         create_json(
             result_loc_.parent.joinpath(
                 'portal_dashboards', slug,
                 'content_consumption_lastweek.csv'))
         post_data_to_blob(
             result_loc_.parent.joinpath(
                 'portal_dashboards', slug,
                 'content_consumption_lastweek.csv'))
         content_aggregates.to_csv(result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_consumption_lastweek_{}.csv'.format(slug)),
                                   index=False,
                                   encoding='utf-8-sig')
         post_data_to_blob(result_loc_.joinpath(
             date_.strftime('%Y-%m-%d'),
             'content_consumption_lastweek_{}.csv'.format(slug)),
                           backup=True)
 def etb_aggregates(self, result_loc_, slug, df):
     """
     generate charts from ETB data
     :param result_loc_: pathlib.Path object for resultant CSVs.
     :param slug: slug for the channel
     :param df: ETB textbook dataframe for channel
     :return: None
     """
     textbook_status = pd.DataFrame(
         df['Textbook Status'].value_counts()).reindex(
             ['Live', 'Review', 'Draft']).reset_index().fillna(0)
     textbook_status.columns = ['Status', 'Count']
     textbook_status.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                                 'etb_textbook_status.csv'),
                            index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status.csv'))
     textbook_status_grade = pd.DataFrame(
         df.groupby(['Grade', 'Textbook Status'
                     ])['Textbook ID'].count()).reset_index().pivot(
                         index='Grade',
                         columns='Textbook Status').fillna(0).reset_index()
     columns = ['Grade']
     for column in textbook_status_grade.columns[1:]:
         columns.append(column[1])
     textbook_status_grade.columns = columns
     textbook_status_grade = self.grade_fix(textbook_status_grade)
     statuses = ['Live', 'Review', 'Draft']
     column_order = ['Class']
     for status in statuses:
         if status not in textbook_status_grade.columns:
             textbook_status_grade[status] = 0
     textbook_status_grade = textbook_status_grade[column_order + statuses]
     textbook_status_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_textbook_status_grade.csv'),
                                  index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_grade.csv'))
     textbook_status_subject = pd.DataFrame(
         df.groupby(['Subject', 'Textbook Status'
                     ])['Textbook ID'].count()).reset_index().pivot(
                         index='Subject',
                         columns='Textbook Status').fillna(0).reset_index()
     columns = ['Subject']
     for column in textbook_status_subject.columns[1:]:
         columns.append(column[1])
     textbook_status_subject.columns = columns
     column_order = ['Subject']
     for status in statuses:
         if status not in textbook_status_subject.columns:
             textbook_status_subject[status] = 0
     textbook_status_subject = textbook_status_subject[column_order +
                                                       statuses]
     textbook_status_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_textbook_status_subject.csv'),
                                    index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_textbook_status_subject.csv'))
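     # QR code coverage: textbooks with/without QR codes, and linked vs unlinked QR codes overall, by grade and by subject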
     qr_counts = pd.DataFrame(
         df.groupby([
             'channel', 'With QR codes'
         ])['Textbook ID'].count()).reset_index().drop('channel', axis=1)
     qr_counts.columns = ['Status', 'Count']
     qr_counts.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                           'etb_qr_count.csv'),
                      index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_count.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_count.csv'))
     qr_linkage = df[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum()
     qr_linkage.index = ['QR Code With Content', 'QR Code Without Content']
     qr_linkage = pd.DataFrame(qr_linkage).reset_index()
     qr_linkage.columns = ['Status', 'Count']
     qr_linkage.to_csv(result_loc_.joinpath('portal_dashboards', slug,
                                            'etb_qr_content_status.csv'),
                       index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status.csv'))
     qr_linkage_grade = df.groupby('Grade')[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linkage_grade.columns = [
         'Grade', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linkage_grade = self.grade_fix(qr_linkage_grade)
     qr_linkage_grade.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_qr_content_status_grade.csv'),
                             index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_grade.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_grade.csv'))
     qr_linkage_subject = df.groupby('Subject')[[
         'Total QR codes linked to content',
         'Total number of QR codes with no linked content'
     ]].sum().reset_index()
     qr_linkage_subject.columns = [
         'Subject', 'QR Codes with content', 'QR Codes without content'
     ]
     qr_linkage_subject.to_csv(result_loc_.joinpath(
         'portal_dashboards', slug, 'etb_qr_content_status_subject.csv'),
                               index=False)
     create_json(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_subject.csv'))
     post_data_to_blob(
         result_loc_.joinpath('portal_dashboards', slug,
                              'etb_qr_content_status_subject.csv'))
Example #19
    def generate_report(self, result_loc_, date_):
        board_slug = pd.read_csv(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'tenant_info.csv'))[['id', 'slug']]

        df = pd.read_csv(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'content_model_snapshot.csv'))[[
                                     'board', 'medium', 'gradeLevel',
                                     'subject', 'identifier', 'name', 'status',
                                     'createdOn', 'creator', 'lastPublishedOn',
                                     'lastUpdatedOn', 'channel',
                                     'lastSubmittedOn'
                                 ]]

        if 'createdOn' not in df.columns:
            df['createdOn'] = 'T'
        if 'lastSubmittedOn' not in df.columns:
            df['lastSubmittedOn'] = 'T'
        if 'lastPublishedOn' not in df.columns:
            df['lastPublishedOn'] = 'T'
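        # 'T' acts as an empty-timestamp placeholder so missing dates format to an empty string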

        df['createdOn'] = df['createdOn'].fillna('T').apply(self.date_format)

        review = df[df['status'] == 'Review']
        review['lastSubmittedOn'] = review['lastSubmittedOn'].fillna(
            'T').apply(self.date_format)
        review.rename(
            columns={'lastSubmittedOn': 'Pending in current status since'},
            inplace=True)
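        # 'Pending in current status since' maps to last submission for Review, creation for never-published Drafts, and last publish for Live/Unlisted content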

        only_draft = df[(df['status'] == 'Draft')
                        & (df['lastPublishedOn'].isna())]
        only_draft.loc[:,
                       'Pending in current status since'] = only_draft.loc[:,
                                                                           'createdOn']

        published = df[(df['status'] == 'Unlisted') | \
                       (df['status'] == 'Live') | \
                       ((df['status'] == 'Draft') & (df['lastPublishedOn'].notna()))]
        published['status'] = published['status'].replace(
            'Unlisted', 'Limited Sharing')
        published['lastPublishedOn'] = published['lastPublishedOn'].fillna(
            'T').apply(self.date_format)
        published.rename(
            columns={'lastPublishedOn': 'Pending in current status since'},
            inplace=True)

        result_df = pd.concat([review, only_draft, published])
        result_df['gradeSort'] = result_df['gradeLevel'].apply(self.grade_sort)

        result_df = result_df.sort_values(
            by=['board', 'medium', 'gradeSort', 'subject', 'name'],
            ascending=[False, True, True, True, True])

        result_df = result_df.fillna('Unknown')

        result_df = result_df[[
            'board', 'medium', 'gradeLevel', 'subject', 'identifier', 'name',
            'status', 'createdOn', 'Pending in current status since',
            'creator', 'channel'
        ]]

        result_df.to_csv(result_loc_.joinpath(
            date_.strftime('%Y-%m-%d'), 'Content_Creation_Status_Overall.csv'),
                         index=False,
                         encoding='utf-8')
        # create_json(result_loc_.joinpath('Content_Creation_Status.csv'))
        post_data_to_blob(
            result_loc_.joinpath(date_.strftime('%Y-%m-%d'),
                                 'Content_Creation_Status_Overall.csv'), True)

        for index, bs_value in board_slug.iterrows():
            channel, slug = bs_value.values
            print(slug)

            channel_df = result_df[result_df['channel'] == channel]
            channel_df = channel_df[[
                'board', 'medium', 'gradeLevel', 'subject', 'identifier',
                'name', 'status', 'createdOn',
                'Pending in current status since', 'creator'
            ]]
            channel_df.columns = [
                'Board', 'Medium', 'Grade', 'Subject', 'Content Id',
                'Content name', 'Status', 'Created On',
                'Pending in current status since', 'Created By'
            ]

            os.makedirs(result_loc_.joinpath(slug), exist_ok=True)
            channel_df.to_csv(result_loc_.joinpath(
                slug, 'Content_Creation_Status.csv'),
                              index=False,
                              encoding='utf-8')
            create_json(
                result_loc_.joinpath(slug, 'Content_Creation_Status.csv'))
            post_data_to_blob(
                result_loc_.joinpath(slug, 'Content_Creation_Status.csv'))
    def daily_metrics(self, read_loc_, date_):
        """
        merge the three metrics
        :param read_loc_: pathlib.Path object to read CSV from.
        :param date_: datetime object to use in path
        :return: None
        """
        try:
            board_slug = \
                pd.read_csv(
                    self.data_store_location.joinpath('textbook_reports', date_.strftime('%Y-%m-%d'), 'tenant_info.csv'))[
                    ['id', 'slug']]
            board_slug.set_index('id', inplace=True)
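            # Map channel id to tenant slug for joining the metric sources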
        except Exception:
            raise Exception('Board Slug Error!')
        try:
            scans_df = pd.read_csv(
                read_loc_.joinpath('dialcode_scans',
                                   date_.strftime('%Y-%m-%d'),
                                   'dial_scans.csv')).fillna('')
            scans_df = scans_df.pivot(index='dialcode_channel',
                                      columns='failed_flag',
                                      values='count').reset_index().fillna(0)
            scans_df = scans_df.join(
                board_slug, on='dialcode_channel',
                how='left')[['slug', 'Failed QR Scans', 'Successful QR Scans']]
            scans_df['Total QR scans'] = scans_df[
                'Successful QR Scans'] + scans_df['Failed QR Scans']
            scans_df['Percentage (%) of Failed QR Scans'] = scans_df[
                'Failed QR Scans'] * 100 / scans_df['Total QR scans']
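            # Scans from channels with no matching tenant slug are reported separately as unmapped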
            unmapped = scans_df[scans_df.slug.isna()]['Total QR scans'].iloc[0]
            scans_df.dropna(subset=['slug'], inplace=True)
        except Exception as e:
            raise Exception('Scans Error! :: {}'.format(str(e)))
        try:
            downloads_df = pd.read_csv(
                read_loc_.joinpath('downloads', date_.strftime('%Y-%m-%d'),
                                   'downloads.csv'))
            downloads_df = downloads_df.fillna('').join(
                board_slug, on='channel',
                how='left')[['count', 'slug']].dropna(subset=['slug'])
            downloads_df.columns = ['Total Content Downloads', 'slug']
        except Exception:
            raise Exception('Downloads Error!')
        try:
            app_df = pd.read_csv(
                read_loc_.joinpath('play', date_.strftime('%Y-%m-%d'),
                                   'app_sessions.csv'))
            app_df = app_df[[
                'Total App Sessions', 'Total Devices on App',
                'Total Time on App (in hours)'
            ]]
            plays_df = pd.read_csv(read_loc_.joinpath(
                'play', date_.strftime('%Y-%m-%d'), 'plays.csv'),
                                   header=[0, 1],
                                   dtype={0: str})

            # Making the channel column as index with string type since the csv is in multiindex format
            plays_df.set_index(plays_df.columns[0], inplace=True)
            plays_df.index.names = ['channel']
            plays_df = plays_df[1:]

            plays_df = plays_df.reset_index().join(board_slug,
                                                   on='channel',
                                                   how='left')
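            # .get() with an all-NaN Series fallback keeps the report schema stable when a pdata id has no plays for the day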
            plays_df['Total Content Plays on App'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Total Content Plays',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Total Content Plays on Portal'] = plays_df.get(
                ('Total Content Plays',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Total Content Plays',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df[
                'Total Devices that played content on App'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['app']),
                    pd.Series(
                        index=plays_df.index,
                        name=('Total Devices that played content',
                              self.config['context']['pdata']['id']['app'])))
            plays_df[
                'Total Devices that played content on Portal'] = plays_df.get(
                    ('Total Devices that played content',
                     self.config['context']['pdata']['id']['portal']),
                    pd.Series(
                        index=plays_df.index,
                        name=(
                            'Total Devices that played content',
                            self.config['context']['pdata']['id']['portal'])))
            plays_df['Content Play Time on App (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['app']),
                pd.Series(index=plays_df.index,
                          name=('Content Play Time (in hours)',
                                self.config['context']['pdata']['id']['app'])))
            plays_df['Content Play Time on Portal (in hours)'] = plays_df.get(
                ('Content Play Time (in hours)',
                 self.config['context']['pdata']['id']['portal']),
                pd.Series(
                    index=plays_df.index,
                    name=('Content Play Time (in hours)',
                          self.config['context']['pdata']['id']['portal'])))
            plays_df = plays_df[[
                'Total Content Plays on App', 'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)', 'slug'
            ]].dropna(subset=['slug'])
        except Exception as e:
            raise Exception('App and Plays Error! :: {}'.format(str(e)))
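        # Outer-join scans, downloads and plays on tenant slug so tenants missing any one metric still appear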
        try:
            daily_metrics_df = scans_df.join(
                downloads_df.set_index('slug'), on='slug',
                how='outer').reset_index(drop=True).join(
                    plays_df.set_index('slug'),
                    on='slug',
                    how='outer',
                    rsuffix='_plays').fillna(0)
            daily_metrics_df['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
        except Exception:
            raise Exception('Daily Metrics Error!')
        try:
            overall = daily_metrics_df[[
                'Successful QR Scans', 'Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Content Plays on Portal',
                'Total Devices that played content on App',
                'Total Devices that played content on Portal',
                'Content Play Time on App (in hours)',
                'Content Play Time on Portal (in hours)'
            ]].sum().astype(int)
            overall['Total App Sessions'] = app_df['Total App Sessions'].loc[0]
            overall['Total Devices on App'] = app_df[
                'Total Devices on App'].loc[0]
            overall['Total Time on App (in hours)'] = app_df[
                'Total Time on App (in hours)'].loc[0]
            overall['Date'] = '-'.join(
                date_.strftime('%Y-%m-%d').split('-')[::-1])
            overall['Unmapped QR Scans'] = unmapped
            overall[
                'Total QR scans'] = overall['Successful QR Scans'] + overall[
                    'Failed QR Scans'] + overall['Unmapped QR Scans']
            overall['Percentage (%) of Failed QR Scans'] = '%.2f' % (
                overall['Failed QR Scans'] * 100 / overall['Total QR scans'])
            overall['Percentage (%) of Unmapped QR Scans'] = '%.2f' % (
                overall['Unmapped QR Scans'] * 100 / overall['Total QR scans'])
            overall['Total Content Plays'] = overall[
                'Total Content Plays on App'] + overall[
                    'Total Content Plays on Portal']
            overall['Total Devices that played content'] = overall[
                'Total Devices that played content on App'] + overall[
                    'Total Devices that played content on Portal']
            overall['Total Content Play Time (in hours)'] = overall[
                'Content Play Time on App (in hours)'] + overall[
                    'Content Play Time on Portal (in hours)']
            overall = overall[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            read_loc_.joinpath('portal_dashboards',
                               'overall').mkdir(exist_ok=True)
            read_loc_.joinpath('portal_dashboards',
                               'mhrd').mkdir(exist_ok=True)
            try:
                get_data_from_blob(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
                blob_data = pd.read_csv(
                    read_loc_.joinpath('portal_dashboards', 'overall',
                                       'daily_metrics.csv'))
            except:
                blob_data = pd.DataFrame()
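            # Append today's row to the historical daily_metrics.csv, keeping the latest entry per Date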
            blob_data = blob_data.append(pd.DataFrame(overall).transpose(),
                                         sort=False).fillna('')
            blob_data.index = pd.to_datetime(blob_data.Date, format='%d-%m-%Y')
            blob_data.drop_duplicates('Date', inplace=True, keep='last')
            blob_data.sort_index(inplace=True)
            # can remove after first run
            blob_data = blob_data[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Unmapped QR Scans',
                'Percentage (%) of Failed QR Scans',
                'Percentage (%) of Unmapped QR Scans',
                'Total Content Downloads', 'Total App Sessions',
                'Total Devices on App', 'Total Time on App (in hours)',
                'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
            blob_data.to_csv(read_loc_.joinpath('portal_dashboards', 'overall',
                                                'daily_metrics.csv'),
                             index=False)
            create_json(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
            post_data_to_blob(
                read_loc_.joinpath('portal_dashboards', 'overall',
                                   'daily_metrics.csv'))
        except Exception:
            raise Exception('Overall Metrics Error!')
        try:
            daily_metrics_df['Total Content Plays'] = daily_metrics_df[
                'Total Content Plays on App'] + daily_metrics_df[
                    'Total Content Plays on Portal']
            daily_metrics_df['Total Devices that played content'] = daily_metrics_df[
                                                                        'Total Devices that played content on App'] + \
                                                                    daily_metrics_df[
                                                                        'Total Devices that played content on Portal']
            daily_metrics_df['Total Content Play Time (in hours)'] = daily_metrics_df[
                                                                         'Content Play Time on App (in hours)'] + \
                                                                     daily_metrics_df[
                                                                         'Content Play Time on Portal (in hours)']
            daily_metrics_df.set_index(['slug'], inplace=True)
            daily_metrics_df = daily_metrics_df[[
                'Date', 'Total QR scans', 'Successful QR Scans',
                'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                'Total Content Downloads', 'Total Content Plays on App',
                'Total Devices that played content on App',
                'Content Play Time on App (in hours)',
                'Total Content Plays on Portal',
                'Total Devices that played content on Portal',
                'Content Play Time on Portal (in hours)',
                'Total Content Plays', 'Total Devices that played content',
                'Total Content Play Time (in hours)'
            ]]
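            # Write one daily_metrics.csv per tenant slug, appending to the history already in blob storage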
            for slug, value in daily_metrics_df.iterrows():
                if slug != '':
                    read_loc_.joinpath('portal_dashboards',
                                       slug).mkdir(exist_ok=True)
                    for key, val in value.items():
                        if key not in [
                                'Date', 'Percentage (%) of Failed QR Scans'
                        ]:
                            value[key] = int(val)
                        elif key == 'Percentage (%) of Failed QR Scans':
                            value[key] = '%.2f' % val
                    try:
                        get_data_from_blob(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                        blob_data = pd.read_csv(
                            read_loc_.joinpath('portal_dashboards', slug,
                                               'daily_metrics.csv'))
                    except:
                        blob_data = pd.DataFrame()
                    blob_data = blob_data.append(
                        pd.DataFrame(value).transpose(), sort=False).fillna('')
                    blob_data.index = pd.to_datetime(blob_data.Date,
                                                     format='%d-%m-%Y')
                    blob_data.drop_duplicates('Date',
                                              inplace=True,
                                              keep='last')
                    blob_data.sort_index(inplace=True)
                    # can remove after first run
                    blob_data = blob_data[[
                        'Date', 'Total QR scans', 'Successful QR Scans',
                        'Failed QR Scans', 'Percentage (%) of Failed QR Scans',
                        'Total Content Downloads',
                        'Total Content Plays on App',
                        'Total Devices that played content on App',
                        'Content Play Time on App (in hours)',
                        'Total Content Plays on Portal',
                        'Total Devices that played content on Portal',
                        'Content Play Time on Portal (in hours)',
                        'Total Content Plays',
                        'Total Devices that played content',
                        'Total Content Play Time (in hours)'
                    ]]
                    blob_data.to_csv(read_loc_.joinpath(
                        'portal_dashboards', slug, 'daily_metrics.csv'),
                                     index=False)
                    create_json(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
                    post_data_to_blob(
                        read_loc_.joinpath('portal_dashboards', slug,
                                           'daily_metrics.csv'))
        except Exception:
            raise Exception('State Metrics Error!')
Example #21
    def get_tbs(self):
        tb_url = "{}/api/content/v1/search".format(self.content_search)
        payload = """{
                    "request": {
                        "filters": {
                            "contentType": ["Textbook"],
                            "status": ["Live"]
                        },
                        "sort_by": {"createdOn":"desc"},
                        "limit": 10000
                    }
                }"""
        headers = {
            'content-type': "application/json; charset=utf-8",
            'cache-control': "no-cache"
        }
        retry_count = 0
        while 1:
            try:
                response = requests.request("POST",
                                            tb_url,
                                            data=payload,
                                            headers=headers)
                break
            except requests.exceptions.ConnectionError:
                print("Retry {} for textbook list".format(retry_count + 1))
                retry_count += 1
                sleep(10)
                if retry_count == 5:
                    print("Max retries reached...")
                    break

        list_of_textbooks = pd.DataFrame(response.json()['result']['content'])
        list_of_textbooks = list_of_textbooks[[
            'identifier', 'channel', 'board', 'gradeLevel', 'medium', 'name',
            'subject'
        ]]
        tb_list = list(list_of_textbooks.identifier.unique())
        list_of_textbooks.drop_duplicates(subset=['identifier'],
                                          keep='first',
                                          inplace=True)
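        # Walk every Live textbook's hierarchy and classify each QR code as content-linked or not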

        dialcode_df = pd.DataFrame()
        failed_tbs = []
        tb_count = 0

        for tb_id in tb_list:
            tb_count = tb_count + 1

            print("currently running for textbook number %d(%s)/%d" %
                  (tb_count, tb_id, len(tb_list)))

            retry_count = 0
            url = "{}/api/course/v1/hierarchy/{}".format(
                self.content_hierarchy, tb_id)
            while 1:
                try:
                    response = requests.request("GET", url, headers=headers)
                    break
                except requests.exceptions.ConnectionError:
                    print("Retry {} for TOC {}".format(retry_count + 1, tb_id))
                    retry_count += 1
                    sleep(10)
                    if retry_count == 5:
                        print("Max retries reached...")
                        print("Skipping the run for TB ID %s" % (tb_id))
                        failed_tbs.append(tb_id)
                        break

            if response.json()['result'] != {}:

                tb = response.json()['result']['content']

                if 'children' not in tb:
                    continue

                if tb['children'] is None:
                    continue

                if 'index' not in tb['children'][0]:
                    continue

                tree_obj = self.traverse(tb)
                importer = DictImporter()
                root = importer.import_(tree_obj)
                resources = findall(
                    root, filter_=lambda node: node.contentType == "Resource")
                dialcodes = findall(
                    root, filter_=lambda node: node.dialcode != "")

                dialcodes_with_content = []
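                # A QR code counts as content-linked if its node is an ancestor of at least one Resource node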
                for resource in resources:
                    for ancestor in resource.ancestors:
                        dialcodes_with_content.append(
                            (ancestor.dialcode, ancestor.index))

                dialcodes_with_content = set(
                    [x for x in dialcodes_with_content if (x[0] != '')])

                dialcodes_all = []
                for dialcode in dialcodes:
                    dialcodes_all.append((dialcode.dialcode, dialcode.index))

                dialcodes_all = set([x for x in dialcodes_all if (x[0] != '')])

                no_content = pd.DataFrame(list(dialcodes_all -
                                               dialcodes_with_content),
                                          columns=['QR', 'Index'])
                no_content['TB_ID'] = tb_id
                no_content['status'] = 'no content'

                with_content = pd.DataFrame(list(dialcodes_with_content),
                                            columns=['QR', 'Index'])
                with_content['TB_ID'] = tb_id
                with_content['status'] = 'content linked'

                final_df = with_content.copy()
                final_df = final_df.append(no_content)

                final_df['Index'].fillna(int(0), inplace=True)
                final_df['Index'].loc[final_df['Index'] == ''] = 0
                final_df.Index = final_df.Index.astype('category')
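                # Natural-sort the topic index so QR codes can be ranked by their position in the textbook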
                final_df.Index.cat.reorder_categories(natsorted(
                    set(final_df.Index)),
                                                      inplace=True,
                                                      ordered=True)
                final_df_sorted_by_index = final_df.sort_values('Index')

                ranks_to_be_assigned_for_positions_of_QR = list(
                    range(len(final_df_sorted_by_index.QR) + 1))[1:]

                final_df_ranked_for_QR = final_df_sorted_by_index

                final_df_ranked_for_QR[
                    'Position of QR in a TB'] = ranks_to_be_assigned_for_positions_of_QR
                final_df_ranked_for_QR[
                    'Position of QR in a TB'] = final_df_ranked_for_QR[
                        'Position of QR in a TB'].astype(int)

                dialcode_df = dialcode_df.append(final_df_sorted_by_index,
                                                 ignore_index=True)

        dialcode_state = dialcode_df.merge(list_of_textbooks,
                                           how='left',
                                           left_on='TB_ID',
                                           right_on='identifier')
        dialcode_state_final = dialcode_state[[
            'board', 'gradeLevel', 'QR', 'medium', 'subject', 'TB_ID', 'name',
            'status', 'Index', 'Position of QR in a TB', 'channel'
        ]]

        execution_date_str = datetime.strptime(self.execution_date,
                                               "%d/%m/%Y").strftime('%Y-%m-%d')

        os.makedirs(self.data_store_location.joinpath('tb_metadata',
                                                      execution_date_str),
                    exist_ok=True)
        dialcode_state_final.to_csv(self.data_store_location.joinpath(
            'tb_metadata', execution_date_str, 'qr_code_state.csv'),
                                    index=False,
                                    encoding='UTF-8')
        post_data_to_blob(self.data_store_location.joinpath(
            'tb_metadata', execution_date_str, 'qr_code_state.csv'),
                          backup=True)
Example #22
    def generate_report(self):
        execution_date_str = datetime.strptime(self.execution_date,
                                               "%d/%m/%Y").strftime('%Y-%m-%d')
        week_last_date = (datetime.strptime(self.execution_date, "%d/%m/%Y") -
                          timedelta(1)).strftime('%d/%m/%Y')

        board_slug = pd.read_csv(
            self.data_store_location.joinpath(
                'textbook_reports', execution_date_str,
                'tenant_info.csv'))[['id', 'slug']]
        board_slug.set_index('slug', inplace=True)

        scans_df = pd.read_csv(
            self.data_store_location.joinpath('textbook_reports',
                                              execution_date_str,
                                              'weekly_dialcode_counts.csv'))
        scans_df["edata_filters_dialcodes"] = scans_df[
            "edata_filters_dialcodes"].str.upper().str.strip()
        scans_df = scans_df.groupby("edata_filters_dialcodes").agg({
            "Total Scans":
            "sum"
        }).reset_index()

        tb_dial_df = pd.read_csv(
            self.data_store_location.joinpath('tb_metadata',
                                              execution_date_str,
                                              'qr_code_state.csv'))
        tb_dial_df["QR"] = tb_dial_df["QR"].str.upper().str.strip()

        tb_dial_scans_df = pd.merge(scans_df,
                                    tb_dial_df,
                                    left_on="edata_filters_dialcodes",
                                    right_on="QR")
        tb_dial_scans_df['Index'] = tb_dial_scans_df['Index'].str.split(
            '.').str[0].astype(int)
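        # Weight each QR code's scans by its position index to get an average scanned position per channel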

        tb_dial_scans_df.groupby(["channel", "TB_ID", "Index"]).agg({
            "Total Scans":
            "sum"
        }).reset_index()
        tb_dial_scans_df['weighted_scans'] = tb_dial_scans_df[
            'Index'] * tb_dial_scans_df['Total Scans']

        weighted_avg_df = tb_dial_scans_df.groupby("channel").agg({
            "Total Scans":
            "sum",
            "weighted_scans":
            "sum"
        })

        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_scans'] / weighted_avg_df['Total Scans']
        weighted_avg_df['weighted_average'] = weighted_avg_df[
            'weighted_average'].round(1)
        weighted_avg_df = weighted_avg_df.reset_index()[[
            'channel', 'weighted_average'
        ]]
        weighted_avg_df.rename(columns={"weighted_average": "Index"},
                               inplace=True)
        weighted_avg_df['Date'] = week_last_date
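        # Each tenant's gps_learning.csv history in blob storage is updated with this week's value below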

        for slug, board_value in board_slug.iterrows():
            print(slug)
            try:
                get_data_from_blob(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
                blob_data = pd.read_csv(
                    self.data_store_location.joinpath("portal_dashboards",
                                                      slug,
                                                      "gps_learning.csv"))
            except Exception:
                blob_data = pd.DataFrame(columns=["Date", "Index"])

            current_channel_df = weighted_avg_df[weighted_avg_df['channel'] ==
                                                 board_value.id][[
                                                     "Date", "Index"
                                                 ]]

            blob_data = pd.concat([blob_data, current_channel_df])
            blob_data.drop_duplicates(subset=['Date'],
                                      keep='last',
                                      inplace=True)
            blob_data.to_csv(self.data_store_location.joinpath(
                'portal_dashboards', slug, 'gps_learning.csv'),
                             index=False)
            create_json(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
            post_data_to_blob(
                self.data_store_location.joinpath('portal_dashboards', slug,
                                                  'gps_learning.csv'))
    def generate_reports(self, result_loc_, content_search_,
                         content_hierarchy_, date_):
        """
        generate the overall ETB and DCE reports at textbook and detailed levels
        :param hostname:IP and port to query the list of textbooks and hierarchy
        :param result_loc_: location to store data
        :return: None
        """
        board_slug = pd.read_csv(
            self.data_store_location.joinpath(
                'textbook_reports', date_.strftime('%Y-%m-%d'),
                'tenant_info.csv'))[['id', 'slug']]
        board_slug.set_index('id', inplace=True)

        importer = DictImporter()
        dialcode_etb = []
        textbook_etb = []
        dialcode_dce = []
        textbook_dce = []
        scans_df = pd.read_csv(
            result_loc_.joinpath('textbook_reports', 'dialcode_counts.csv'))
        scans_df = scans_df.groupby(
            'edata_filters_dialcodes')['Total Scans'].sum()
        tb_url = "{}v3/search".format(content_search_)
        payload = """{
                "request": {
                    "filters": {
                        "contentType": ["Textbook"],
                        "status": ["Live", "Review", "Draft"]
                    },
                    "sort_by": {"createdOn":"desc"},
                    "limit": 10000
                }
            }"""
        tb_headers = {
            'content-type': "application/json; charset=utf-8",
            'cache-control': "no-cache"
        }
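        # Fetch all textbooks (Live, Review and Draft), retrying up to 5 times on connection errors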
        retry_count = 0
        while retry_count < 5:
            try:
                response = requests.request("POST",
                                            tb_url,
                                            data=payload,
                                            headers=tb_headers)
                textbooks = pd.DataFrame(
                    response.json()['result']['content'])[[
                        'identifier', 'createdFor', 'createdOn',
                        'lastUpdatedOn', 'board', 'medium', 'gradeLevel',
                        'subject', 'name', 'status', 'channel'
                    ]]
                textbooks[textbooks.duplicated(
                    subset=['identifier', 'status'])].to_csv(
                        result_loc_.joinpath('textbook_reports',
                                             date_.strftime('%Y-%m-%d'),
                                             'duplicate_tb.csv'),
                        index=False)
                textbooks.drop_duplicates(subset=['identifier', 'status'],
                                          inplace=True)
                textbooks['gradeLevel'] = textbooks['gradeLevel'].apply(
                    lambda x: ['Unknown'] if type(x) == float else x)
                textbooks.fillna({'createdFor': ' '}, inplace=True)
                textbooks.fillna('Unknown', inplace=True)
                textbooks['grade'] = textbooks['gradeLevel'].apply(
                    lambda grade: ', '.join(
                        [y if y == 'KG' else y.title() for y in grade]))
                textbooks.to_csv(result_loc_.joinpath(
                    'textbook_reports', date_.strftime('%Y-%m-%d'),
                    'tb_list.csv'),
                                 index=False)
                break
            except requests.exceptions.ConnectionError:
                print("Retry {} for textbook list".format(retry_count + 1))
                retry_count += 1
                time.sleep(10)
        else:
            print("Max retries reached...")
            return
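        # Walk each textbook's hierarchy: ETB rows for all statuses, DCE rows only for Live textbooks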
        counter = 0
        skipped_tbs = []
        for ind_, row_ in textbooks.iterrows():
            counter += 1
            print('Running for {} out of {}: {}% ({} sec/it)'.format(
                counter, textbooks.shape[0],
                '%.2f' % (counter * 100 / textbooks.shape[0]),
                '%.2f' % ((datetime.now() - self.start_time).total_seconds() /
                          counter)))
            if isinstance(row_['gradeLevel'], list) and len(
                    row_['gradeLevel']) == 0:
                row_['gradeLevel'].append(' ')
            if row_['status'] == 'Live':
                url = "{}learning-service/content/v3/hierarchy/{}".format(
                    content_hierarchy_, row_['identifier'])
            else:
                url = "{}learning-service/content/v3/hierarchy/{}?mode=edit".format(
                    content_hierarchy_, row_['identifier'])
            retry_count = 0
            while retry_count < 5:
                try:
                    response = requests.get(url)
                    tb = response.json()['result']['content']
                    tree_obj = self.parse_etb(tb, row_)
                    root = importer.import_(tree_obj)
                    self.etb_dialcode(row_, (root, ) + root.descendants,
                                      dialcode_etb)
                    self.etb_textbook(row_, root, textbook_etb)
                    if row_['status'] == 'Live':
                        chapters = findall(
                            root, filter_=lambda node: node.depth == 1)
                        for i in range(len(chapters)):
                            term = 'T1' if i <= (len(chapters) / 2) else 'T2'
                            chapters[i].term = term
                            for descendant in chapters[i].descendants:
                                descendant.term = term
                        root.term = 'T1'
                        dialcode_wo_content = findall(
                            root,
                            filter_=lambda node: node.dialcode != '' and node.
                            leafNodesCount == 0)
                        self.dce_dialcode(row_, dialcode_wo_content,
                                          dialcode_dce)
                        self.dce_textbook(row_, root, textbook_dce)
                    break
                except requests.exceptions.ConnectionError:
                    retry_count += 1
                    print("ConnectionError: Retry {} for textbook {}".format(
                        retry_count, row_['identifier']))
                    time.sleep(10)
                except KeyError:
                    with open(
                            result_loc_.joinpath('textbook_reports',
                                                 date_.strftime('%Y-%m-%d'),
                                                 'etb_error_log.log'),
                            'a') as f:
                        f.write(
                            "KeyError: Resource not found for textbook {} in {}\n"
                            .format(row_['identifier'], row_['status']))
                    break
            else:
                print("Max retries reached...")
                skipped_tbs.append(row_)
                continue
            if response.status_code != 200:
                continue

        etb_dc = pd.DataFrame(dialcode_etb)
        etb_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_dialcode_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'ETB_dialcode_data_pre.csv'),
                          backup=True)
        etb_tb = pd.DataFrame(textbook_etb).fillna('')
        etb_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_textbook_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'ETB_textbook_data_pre.csv'),
                          backup=True)
        dce_dc = pd.DataFrame(dialcode_dce)
        dce_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_dialcode_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'DCE_dialcode_data_pre.csv'),
                          backup=True)
        dce_tb = pd.DataFrame(textbook_dce).fillna('')
        dce_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_textbook_data_pre.csv'),
                      index=False,
                      encoding='utf-8-sig')
        post_data_to_blob(result_loc_.joinpath('textbook_reports',
                                               date_.strftime('%Y-%m-%d'),
                                               'DCE_textbook_data_pre.csv'),
                          backup=True)
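        # Keep only channels with a known tenant slug, then publish per-slug copies of the four reports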
        channels = set()
        for c in etb_dc.channel.unique():
            if c in board_slug.index:
                channels.add(c)
        for c in etb_tb.channel.unique():
            if c in board_slug.index:
                channels.add(c)
        for c in dce_dc.channel.unique():
            if c in board_slug.index:
                channels.add(c)
        for c in dce_tb.channel.unique():
            if c in board_slug.index:
                channels.add(c)
        channels = list(channels)
        etb_dc = etb_dc.join(scans_df, on='QR Code', how='left').fillna('')
        etb_dc.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        etb_dc = etb_dc[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Textbook Status', 'Type of Node', 'Level 1 Name',
            'Level 2 Name', 'Level 3 Name', 'Level 4 Name', 'Level 5 Name',
            'QR Code', 'Total Scans', 'Number of contents'
        ]]
        etb_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_dialcode_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        etb_tb.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        etb_tb = etb_tb[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Textbook Status', 'Created On',
            'Last Updated On', 'Total content linked',
            'Total QR codes linked to content',
            'Total number of QR codes with no linked content',
            'Total number of leaf nodes',
            'Number of leaf nodes with no content', 'With QR codes'
        ]]
        etb_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'ETB_textbook_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        dce_dc = dce_dc.join(scans_df, on='QR Code', how='left').fillna('')
        dce_dc.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        dce_dc = dce_dc[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Level 1 Name', 'Level 2 Name', 'Level 3 Name',
            'Level 4 Name', 'Level 5 Name', 'QR Code', 'Total Scans', 'Term'
        ]]
        dce_dc.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_dialcode_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        dce_tb.sort_values(
            by=['channel', 'Medium', 'grade_sort', 'Subject', 'Textbook Name'],
            inplace=True)
        dce_tb = dce_tb[[
            'Textbook ID', 'channel', 'Medium', 'Grade', 'Subject',
            'Textbook Name', 'Created On', 'Last Updated On',
            'Total number of QR codes',
            'Number of QR codes with atleast 1 linked content',
            'Number of QR codes with no linked content',
            'Term 1 QR Codes with no linked content',
            'Term 2 QR Codes with no linked content'
        ]]
        dce_tb.to_csv(result_loc_.joinpath('textbook_reports',
                                           date_.strftime('%Y-%m-%d'),
                                           'DCE_textbook_data.csv'),
                      index=False,
                      encoding='utf-8-sig')
        for channel in channels:
            slug = board_slug.loc[channel]['slug']
            df_etb_dc = etb_dc[etb_dc['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            etb_dc_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'ETB_dialcode_data.csv')
            df_etb_dc.drop('channel', axis=1).to_csv(etb_dc_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(etb_dc_path)
            post_data_to_blob(etb_dc_path)
            df_etb_tb = etb_tb[etb_tb['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            etb_tb_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'ETB_textbook_data.csv')
            self.etb_aggregates(result_loc_, slug, df_etb_tb)
            df_etb_tb.drop(['channel', 'With QR codes'],
                           axis=1).to_csv(etb_tb_path,
                                          index=False,
                                          encoding='utf-8-sig')
            create_json(etb_tb_path)
            post_data_to_blob(etb_tb_path)
            df_dce_dc = dce_dc[dce_dc['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            dce_dc_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'DCE_dialcode_data.csv')
            df_dce_dc.drop('channel', axis=1).to_csv(dce_dc_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(dce_dc_path)
            post_data_to_blob(dce_dc_path)
            df_dce_tb = dce_tb[dce_tb['channel'] == channel]
            result_loc_.joinpath('portal_dashboards',
                                 slug).mkdir(exist_ok=True)
            dce_tb_path = result_loc_.joinpath('portal_dashboards', slug,
                                               'DCE_textbook_data.csv')
            try:
                self.dce_aggregates(result_loc_, slug, df_dce_tb)
            except IndexError:
                pass
            df_dce_tb.drop('channel', axis=1).to_csv(dce_tb_path,
                                                     index=False,
                                                     encoding='utf-8-sig')
            create_json(dce_tb_path)
            post_data_to_blob(dce_tb_path)
        if skipped_tbs:
            with open(
                    result_loc_.joinpath('textbook_reports',
                                         date_.strftime('%Y-%m-%d'),
                                         'etb_error_log.log'), 'a') as f:
                for tb_id in skipped_tbs:
                    f.write(
                        'ConnectionError: Failed to fetch Hierarchy for {} in {} state.\n'
                        .format(tb_id['identifier'], tb_id['status']))