def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: build the month-to-date projects activity report.

    Reads ``date`` and ``company`` from the request params (falling back to
    ``DEFAULT_DAY`` / the configured default company), loads the MTD
    repositories ranking, filters it against the staging projects filter
    list, and saves the result as the public ``projects_activity_MTD``
    report.

    Returns a 200 response embedding the output frame, or a 500 response
    carrying the exception and its formatted traceback on any failure.
    """
    # NOTE(review): configuring the root logger on every request is unusual
    # for a shared function host — confirm this is intentional.
    logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s', level=logging.DEBUG)
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        company = get_req_param(req, 'company', default=Config().default_company)
        df = ReposRankingMTD(date=date, company=company).read()
        out_df = filter_projects(
            df=df,
            projects_filter_list=DataLake().staging.load_projects_filter(),
            commits_amount_field=DataLake(
            ).public.schemas.repo_commits_ranking.commits,
            repo_name_field=DataLake(
            ).public.schemas.repo_commits_ranking.repo)
        DataLake().public.save_report(report_df=out_df,
                                      report_name='projects_activity_MTD',
                                      date=date,
                                      company=company)
        # NOTE(review): the body embeds the DataFrame repr inside braces —
        # this is not valid JSON for a multi-line frame; confirm consumers
        # expect this format.
        return func.HttpResponse(f'{{"output": "{out_df}"}}')
    except Exception as ex:
        log.error(f'Exception {ex}')
        log.exception(ex)
        return func.HttpResponse(
            f"This HTTP triggered function failed {ex} "
            f"{''.join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}",
            status_code=500)
def get_change_report(date: datetime):
    """Compute and publish the OSCI change ranking for *date*.

    Loads the stored YTD ranking for *date* and for the previous period,
    derives position/contributor deltas between them, then saves both the
    raw change report and its Solutions Hub view.
    """
    report_name = 'OSCI_ranking_YTD'
    rank_field = 'Position'
    change_suffix = 'Change'
    output_report_name = 'OSCI_change_ranking'
    previous_date = __get_previous_date(date=date)
    report_schema = DataLake().public.schemas.company_contributors_ranking

    def _ranked(report_date):
        # Load a stored ranking and expose its row index as the rank column.
        raw = DataLake().public.get_report(report_name=report_name,
                                           date=report_date)
        return raw.reset_index().rename(columns={'index': rank_field})

    new_report = _ranked(date)
    old_report = _ranked(previous_date)
    change_report = get_osci_ranking_change_report(
        old_report=old_report,
        new_report=new_report,
        company_field=report_schema.company,
        active_contributors_field=report_schema.active,
        total_community_field=report_schema.total,
        rank_field=rank_field,
        change_suffix=change_suffix)
    DataLake().public.save_report(report_df=change_report,
                                  report_name=output_report_name,
                                  date=date)
    DataLake().public.save_solutions_hub_osci_change_report_view(
        change_report=change_report,
        report_dir_name='SolutionsHub_' + output_report_name,
        old_date=previous_date,
        new_date=date)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: load staged push-events commits into BigQuery.

    Reads ``date`` from the request params, iterates every staged
    push-events blob up to that date, and uploads each frame (restricted to
    the required schema columns) to the BigQuery commits table.

    Returns a success message on completion, or a 500 response carrying the
    exception and its formatted traceback on failure.
    """
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        for blob_path in DataLake().staging.get_push_events_commits_paths(
                to_date=date):
            df = DataLake().staging.get_push_events_commits(blob_path)[
                PushEventsCommitsSchema.required]
            job_results = DataLake().big_query.load_dataframe(
                df=df,
                table_id=BigQueryPushEventsCommitsColumns.table_id,
                schema=BigQueryPushEventsCommitsColumns.schema)
            logging.info("{} rows and {} columns in {}".format(
                job_results.num_rows, len(job_results.schema),
                BigQueryPushEventsCommitsColumns.table_id))
    except Exception as ex:
        log.error(f'Exception {ex}')
        log.exception(ex)
        return func.HttpResponse(
            f"This HTTP triggered function failed {ex} "
            f"{''.join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}",
            status_code=500)
    # BUG FIX: the original returned the success response from a ``finally``
    # block; a ``return`` in ``finally`` overrides any earlier return, so the
    # 500 error response produced in ``except`` was silently replaced by a
    # 200. Return the success response only when the try body completed.
    return func.HttpResponse(f"This HTTP triggered function executed")
def load_repositories(date: datetime) -> pd.DataFrame:
    """Fetch repository details for *date*, persist and return them.

    Reads the landed repository names for the day; when none were landed,
    an empty frame with the expected columns is produced. The result is
    saved via the :class:`Repositories` store before being returned.
    """
    log.debug(f'Load repositories information for {date:%Y-%m-%d}')
    repositories = Repositories(date=date)
    repositories_names = DataLake().landing.get_repositories(date=date)
    if repositories_names.empty:
        df = pd.DataFrame(data=[], columns=Repositories.schema.required)
    else:
        name_column = DataLake().landing.schemas.repositories_names.name
        df = _load_repositories(repos_names=repositories_names[name_column])
    repositories.save(df)
    return df
def load_osci_ranking_to_bq(date: datetime.datetime):
    """Upload the YTD OSCI ranking report for *date* to BigQuery.

    Loads the stored ``OSCI_ranking_YTD`` report, turns its row index into
    the position column, maps columns to the BigQuery schema and stamps the
    report date before loading.
    """
    bq_columns = BigQueryOSCIRankingReport.Columns
    # NOTE(review): positions here stay 0-based (the sibling MTD/YTD loader
    # adds 1 to the position column) — confirm this difference is intended.
    report = (DataLake().public
              .get_report(report_name='OSCI_ranking_YTD', date=date)
              [PublicSchemas.company_contributors_ranking.required]
              .reset_index()
              .rename(columns={'index': bq_columns.position})
              .rename(columns=BigQueryOSCIRankingReport.mapping))
    report[bq_columns.date] = date.date()
    return DataLake().big_query.load_dataframe(
        df=report,
        table_id=BigQueryOSCIRankingReport.table_id,
        schema=BigQueryOSCIRankingReport.schema)
def __init__(self, sheet_name: str, from_date: datetime, to_date: datetime,
             top_size: int):
    """Create an Excel sheet writer backed by the public data-lake buffer.

    Args:
        sheet_name: name of the worksheet added to the new workbook.
        from_date: start of the reported period.
        to_date: end of the reported period.
        top_size: number of top entries the report will show.
    """
    self.from_date = from_date
    self.to_date = to_date
    self.top_size = top_size
    self.writer, self.buffer = DataLake().public.get_excel_writer()
    self.workbook: Workbook = self.writer.book
    self.worksheet: Worksheet = self.workbook.add_worksheet(sheet_name)
    self.superscript_format = self.get_format(self.superscript_format_rule)
def get_github_daily_push_events(day: datetime.datetime):
    """Crawl GitHub Archive push events for each hour of *day* and land them.

    For every hour 0..23 the push-event commits are fetched from the archive
    over one shared REST session and saved to the landing area tagged with
    that hour's timestamp.
    """
    with GithubArchiveRest() as rest:
        for hour in range(24):
            # BUG FIX: the original logged *day* before setting the hour, so
            # the message showed the previous iteration's hour (or the
            # caller's original hour on the first pass). Set the hour first
            # so the log reflects the slice actually being crawled.
            day = day.replace(hour=hour)
            log.info(f'Crawl events for {day}')
            push_events_commits = get_hour_push_events_commits(day=day,
                                                               rest=rest)
            DataLake().landing.save_push_events_commits(
                push_event_commits=push_events_commits, date=day)
def load_company_repositories_events_commits(date: datetime, company: str):
    """Collect commits for *company* repositories active at *date*.

    Looks up the company's staged push events for the single day; when none
    exist a warning is logged and nothing is written. Otherwise the commits
    of every referenced repository are fetched via the GitHub REST API and
    stored as private push-event commits.
    """
    schema = DataLake().staging.schemas.push_commits
    events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    if events.empty:
        log.warning(f'No {company} events at {date}')
        return
    repo_names = events[schema.repo_name].unique()
    with GithubRest(token=Config().github_token) as rest:
        company_commits = get_company_repositories_events_commits(
            repositories_names=repo_names,
            date=date,
            company=company,
            rest=rest)
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=pd.DataFrame(company_commits),
            company_name=company,
            date=date)
def filter_out_unlicensed(date: datetime):
    """Drop push-events commits whose repository has no known license.

    Reads the licensed repositories snapshot for *date*, then rewrites each
    company's daily raw push-events commits keeping only rows whose
    repository name appears in that snapshot.
    """
    log.debug(
        f'Filter out unlicensed push events commits for date {date:%Y-%m-%d}')
    licensed_repos = Repositories(date=date)
    log.debug(f'Read licensed repos for date {date:%Y-%m-%d}')
    licensed_names = frozenset(
        licensed_repos.read()[licensed_repos.schema.name].tolist())
    repo_name_field = DataLake().staging.schemas.push_commits.repo_name
    for company, df in DataLake().staging.get_daily_raw_push_events_commits(
            date):
        log.debug(
            f'Filter out unlicensed push events commits for date {date:%Y-%m-%d} for {company}'
        )
        kept = df[df[repo_name_field].isin(licensed_names)]
        DataLake().staging.save_push_events_commits(
            push_event_commits=kept, company_name=company, date=date)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: load one hour of staged push-events commits to BigQuery.

    Reads ``date`` and ``hour`` from the request params, fetches the
    matching day-to-date push-events slice and uploads it to the BigQuery
    commits table.

    Returns a JSON success summary (row/column counts, table id), or a 500
    response with the exception and its formatted traceback on failure.
    """
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        hour = int(get_req_param(req, 'hour', default=0))
        date = date.replace(hour=hour)
        df = DataLake().staging.get_push_events_commits(
            from_date=date, to_date=date,
            date_period_type=DatePeriodType.DTD)
        job_results = DataLake().big_query.load_dataframe(
            df=df,
            table_id=BigQueryPushEventsCommitsColumns.table_id,
            schema=BigQueryPushEventsCommitsColumns.schema)
        result = {'num_rows': job_results.num_rows,
                  'num_columns': len(job_results.schema),
                  'table_id': BigQueryPushEventsCommitsColumns.table_id}
        logging.info("{num_rows} rows and {num_columns} columns in {table_id}".format(**result))
        # BUG FIX: the original constructed this response but never returned
        # it, so the function implicitly returned None on success.
        return func.HttpResponse(json.dumps({'status': 'SUCCESS', **result}))
    except Exception as ex:
        ex_message = (f'Exception {ex} \n'
                      f'{"".join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}')
        log.error(ex_message)
        return func.HttpResponse(ex_message, status_code=500)
def load_osci_ranking_to_bq(date: datetime.datetime,
                            date_period: str = DatePeriodType.YTD):
    """Upload an OSCI ranking report (MTD or YTD) for *date* to BigQuery.

    Resolves the report class and target table from *date_period*, turns the
    report's row index into a 1-based position column, maps columns to the
    BigQuery schema and stamps the report date before loading.

    Raises:
        ValueError: if *date_period* is neither MTD nor YTD.
    """
    if date_period not in (DatePeriodType.MTD, DatePeriodType.YTD):
        raise ValueError(f'Unsupported {date_period}')
    report = OSCIRankingFactory().get_cls(date_period=date_period)(date=date)
    table = date_period_to_table_map[date_period]
    log.debug(
        date.strftime(f'Load {report.name} for %Y-%m-%d to {table.table_id}'))
    frame = report.read()[PublicSchemas.company_contributors_ranking.required]
    frame = frame.reset_index().rename(
        columns={'index': table.Columns.position})
    # Positions are published 1-based.
    frame[table.Columns.position] += 1
    frame = frame.rename(columns=table.mapping)
    frame[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=frame,
                                               table_id=table.table_id,
                                               schema=table.schema)
def read_all(self):
    """Return this report's frames for the last day of each month in range."""
    public_area = DataLake().public
    return public_area.get_reports_for_last_days_of_month(
        report_name=self.name, date=self.date, company=self.company)
def __init__(self, company: str = None,
             date_period_type: str = DatePeriodType.YTD):
    """Initialize with an optional company filter and a date period type.

    Args:
        company: company name to scope the data to, or None for all.
        date_period_type: period granularity (defaults to year-to-date).
    """
    self.company = company
    self.date_period_type = date_period_type
    self.data_lake = DataLake()
    self.commits_schema = self.data_lake.staging.schemas.push_commits
def save(self, df: pd.DataFrame):
    """Serialize *df* and write the bytes to this report's public path."""
    payload = self._write(df)
    DataLake().public.write_bytes_to_file(path=self.path, buffer=payload)
def url(self) -> str:
    """Public URL of this report for its date."""
    public_area = DataLake().public
    return public_area.get_report_url(report_name=self.name, date=self.date)
def path(self) -> str:
    """Data-lake path of the Excel change report for the period end date."""
    public_area = DataLake().public
    return public_area.get_osci_change_excel_report_path(
        base_report_name=self.base_name,
        report_dir_name=self.dir_name,
        date=self.to_date)
def path(self) -> str:
    """Data-lake path where this report is stored for its date."""
    public_area = DataLake().public
    return public_area.get_report_path(report_name=self.name, date=self.date)
def save(self, df: pd.DataFrame):
    """Persist *df* as this report in the public data-lake area."""
    public_area = DataLake().public
    public_area.save_report(report_df=df, report_name=self.name,
                            date=self.date)
def read(self) -> pd.DataFrame:
    """Load this report's frame from the public data-lake area."""
    public_area = DataLake().public
    return public_area.get_report(report_name=self.name, date=self.date)
def __init__(self, date_period_type: str = DatePeriodType.YTD):
    """Initialize with a date period type and resolve the report class.

    Args:
        date_period_type: period granularity (defaults to year-to-date).
    """
    self.date_period_type = date_period_type
    self.data_lake = DataLake()
    self.commits_schema = self.data_lake.staging.schemas.push_commits
    factory = self.REPORT_FACTORY()
    self.report_cls: Type[Report] = factory.get_cls(
        date_period=self.date_period_type)