def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: build and publish the month-to-date projects-activity report.

    Query params:
        date: report date string parsed with DAY_FORMAT (default DEFAULT_DAY).
        company: company name (default ``Config().default_company``).

    Returns a 200 response with a JSON body on success, 500 with the
    traceback text on any failure.
    """
    import json  # local import: the file's import header is outside this view

    # NOTE(review): basicConfig on every request is a no-op after the first
    # call; it belongs in module-level startup code.
    logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s',
                        level=logging.DEBUG)
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        company = get_req_param(req, 'company', default=Config().default_company)
        df = ReposRankingMTD(date=date, company=company).read()
        out_df = filter_projects(
            df=df,
            projects_filter_list=DataLake().staging.load_projects_filter(),
            commits_amount_field=DataLake().public.schemas.repo_commits_ranking.commits,
            repo_name_field=DataLake().public.schemas.repo_commits_ranking.repo)
        DataLake().public.save_report(report_df=out_df,
                                      report_name='projects_activity_MTD',
                                      date=date,
                                      company=company)
        # Fix: the previous hand-built f-string ('{"output": "<df repr>"}')
        # produced invalid JSON whenever the DataFrame repr contained quotes
        # or newlines; serialize properly instead.
        return func.HttpResponse(json.dumps({"output": str(out_df)}))
    except Exception as ex:
        log.error(f'Exception {ex}')
        log.exception(ex)
        return func.HttpResponse(
            f"This HTTP triggered function failed {ex} "
            f"{''.join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}",
            status_code=500)
def get_change_report(date: datetime):
    """Build the OSCI change ranking between ``date`` and the previous period.

    Reads the YTD ranking for both dates, computes the per-company change
    report, and persists it both as the public report and as the
    SolutionsHub view.
    """
    rank_col = 'Position'
    suffix = 'Change'
    source_report = 'OSCI_ranking_YTD'
    target_report = 'OSCI_change_ranking'
    prev_date = __get_previous_date(date=date)
    schema = DataLake().public.schemas.company_contributors_ranking

    def _ranked(report_date):
        # Reset the index so row order becomes an explicit rank column.
        raw = DataLake().public.get_report(report_name=source_report,
                                           date=report_date)
        return raw.reset_index().rename(columns={'index': rank_col})

    new_rep = _ranked(date)
    old_rep = _ranked(prev_date)
    change_report = get_osci_ranking_change_report(
        old_report=old_rep,
        new_report=new_rep,
        company_field=schema.company,
        active_contributors_field=schema.active,
        total_community_field=schema.total,
        rank_field=rank_col,
        change_suffix=suffix)
    DataLake().public.save_report(report_df=change_report,
                                  report_name=target_report,
                                  date=date)
    DataLake().public.save_solutions_hub_osci_change_report_view(
        change_report=change_report,
        report_dir_name='SolutionsHub_' + target_report,
        old_date=prev_date,
        new_date=date)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: push staged push-event commits into BigQuery.

    Query params:
        date: upper bound date string parsed with DAY_FORMAT
            (default DEFAULT_DAY).

    Returns 200 on success, 500 with the traceback text on failure.
    """
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        for blob_path in DataLake().staging.get_push_events_commits_paths(
                to_date=date):
            df = DataLake().staging.get_push_events_commits(blob_path)[
                PushEventsCommitsSchema.required]
            job_results = DataLake().big_query.load_dataframe(
                df=df,
                table_id=BigQueryPushEventsCommitsColumns.table_id,
                schema=BigQueryPushEventsCommitsColumns.schema)
            logging.info("{} rows and {} columns in {}".format(
                job_results.num_rows, len(job_results.schema),
                BigQueryPushEventsCommitsColumns.table_id))
    except Exception as ex:
        log.error(f'Exception {ex}')
        log.exception(ex)
        return func.HttpResponse(
            f"This HTTP triggered function failed {ex} "
            f"{''.join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}",
            status_code=500)
    # Fix: the success response previously lived in a ``finally`` block whose
    # ``return`` unconditionally overrode the 500 error response above, so
    # failures were reported as success. Return it only on the success path.
    return func.HttpResponse(f"This HTTP triggered function executed")
def __init__(self, sheet_name: str, from_date: datetime, to_date: datetime, top_size: int):
    """Create an in-memory Excel workbook with one worksheet ``sheet_name``.

    ``from_date``/``to_date`` bound the reporting window; ``top_size`` caps
    how many ranking rows are rendered.
    """
    self.from_date = from_date
    self.to_date = to_date
    self.top_size = top_size
    self.writer, self.buffer = DataLake().public.get_excel_writer()
    self.workbook: Workbook = self.writer.book
    self.worksheet: Worksheet = self.workbook.add_worksheet(sheet_name)
    # Pre-build the superscript cell format used by rich-string writes.
    self.superscript_format = self.get_format(self.superscript_format_rule)
def load_repositories(date: datetime) -> pd.DataFrame:
    """Load repository information for ``date``, persist it, and return it.

    Starts from an empty frame with the required schema; the loader is only
    invoked when repository names exist in the landing area for the date.
    """
    log.debug(f'Load repositories information for {date:%Y-%m-%d}')
    store = Repositories(date=date)
    result = pd.DataFrame(data=[], columns=Repositories.schema.required)
    names_df = DataLake().landing.get_repositories(date=date)
    if not names_df.empty:
        name_col = DataLake().landing.schemas.repositories_names.name
        result = _load_repositories(repos_names=names_df[name_col])
    store.save(result)
    return result
def load_osci_ranking_to_bq(date: datetime.datetime):
    """Load the year-to-date OSCI ranking report for ``date`` into BigQuery.

    Returns the BigQuery load-job result.
    """
    cols = BigQueryOSCIRankingReport.Columns
    report = DataLake().public.get_report(report_name='OSCI_ranking_YTD',
                                          date=date)
    report = report[PublicSchemas.company_contributors_ranking.required]
    # Turn the positional index into an explicit rank column.
    # NOTE(review): unlike the period-aware loader elsewhere in this file,
    # the position here stays 0-based (no ``+= 1``) — confirm intentional.
    report = report.reset_index().rename(columns={'index': cols.position})
    report = report.rename(columns=BigQueryOSCIRankingReport.mapping)
    report[cols.date] = date.date()
    return DataLake().big_query.load_dataframe(
        df=report,
        table_id=BigQueryOSCIRankingReport.table_id,
        schema=BigQueryOSCIRankingReport.schema)
def get_github_daily_push_events(day: datetime.datetime):
    """Crawl GitHub Archive push events for every hour of ``day`` and land them.

    For each of the 24 hours, fetches that hour's push-event commits over the
    shared REST session and saves them to the landing area keyed by the
    hour-stamped datetime.
    """
    with GithubArchiveRest() as rest:
        for hour in range(24):
            # Fix: set the hour *before* logging so the log line reflects the
            # hour actually being crawled (previously the first iteration
            # logged the caller's hour and later ones the previous hour).
            day = day.replace(hour=hour)
            log.info(f'Crawl events for {day}')
            push_events_commits = get_hour_push_events_commits(day=day,
                                                               rest=rest)
            DataLake().landing.save_push_events_commits(
                push_event_commits=push_events_commits, date=day)
def load_company_repositories_events_commits(date: datetime, company: str):
    """Crawl day-level push-event commits for ``company`` repositories.

    Reads the company's staged DTD push events for ``date``; when none exist,
    logs a warning and returns. Otherwise crawls commits for the distinct
    repository names via the GitHub REST API and saves them to staging.
    """
    events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    schema = DataLake().staging.schemas.push_commits
    if events.empty:
        log.warning(f'No {company} events at {date}')
        return
    with GithubRest(token=Config().github_token) as rest:
        repo_names = events[schema.repo_name].unique()
        company_commits = get_company_repositories_events_commits(
            repositories_names=repo_names,
            date=date,
            company=company,
            rest=rest)
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=pd.DataFrame(company_commits),
            company_name=company,
            date=date)
def filter_out_unlicensed(date: datetime):
    """Drop push-event commits whose repository is not in the licensed set.

    Builds a frozenset of licensed repository names for ``date``, then for
    each company's raw daily push events keeps only rows whose repo name is
    in that set and saves the filtered frame back to staging.
    """
    log.debug(
        f'Filter out unlicensed push events commits for date {date:%Y-%m-%d}')
    repos = Repositories(date=date)
    log.debug(f'Read licensed repos for date {date:%Y-%m-%d}')
    allowed = frozenset(repos.read()[repos.schema.name].tolist())
    # Column holding the repository name in the staged push-commit schema.
    repo_col = DataLake().staging.schemas.push_commits.repo_name
    for company, commits_df in DataLake().staging.get_daily_raw_push_events_commits(date):
        log.debug(
            f'Filter out unlicensed push events commits for date {date:%Y-%m-%d} for {company}'
        )
        kept = commits_df[commits_df[repo_col].isin(allowed)]
        DataLake().staging.save_push_events_commits(
            push_event_commits=kept, company_name=company, date=date)
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: load one hour of staged push-event commits into BigQuery.

    Query params:
        date: date string parsed with DAY_FORMAT (default DEFAULT_DAY).
        hour: integer hour 0-23 applied onto ``date`` (default 0).

    Returns a JSON status payload on success, 500 with traceback on failure.
    """
    try:
        log.info(f"Http trigger. req.params: {req.params}")
        date = datetime.datetime.strptime(
            get_req_param(req, 'date', default=DEFAULT_DAY), DAY_FORMAT)
        hour = int(get_req_param(req, 'hour', default=0))
        date = date.replace(hour=hour)
        df = DataLake().staging.get_push_events_commits(
            from_date=date, to_date=date,
            date_period_type=DatePeriodType.DTD)
        job_results = DataLake().big_query.load_dataframe(
            df=df,
            table_id=BigQueryPushEventsCommitsColumns.table_id,
            schema=BigQueryPushEventsCommitsColumns.schema)
        result = {'num_rows': job_results.num_rows,
                  'num_columns': len(job_results.schema),
                  'table_id': BigQueryPushEventsCommitsColumns.table_id}
        logging.info(
            "{num_rows} rows and {num_columns} columns in {table_id}".format(**result))
        # Fix: the success response was built but never returned, so the
        # function returned None on success.
        return func.HttpResponse(json.dumps({'status': 'SUCCESS', **result}))
    except Exception as ex:
        ex_message = (
            f'Exception {ex} \n'
            f'{"".join(tb.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__))}')
        log.error(ex_message)
        return func.HttpResponse(ex_message, status_code=500)
def load_osci_ranking_to_bq(date: datetime.datetime, date_period: str = DatePeriodType.YTD):
    """Load an OSCI ranking (MTD or YTD) for ``date`` into its BigQuery table.

    Raises:
        ValueError: for any unsupported ``date_period``.

    Returns the BigQuery load-job result.
    """
    if date_period not in (DatePeriodType.MTD, DatePeriodType.YTD):
        raise ValueError(f'Unsupported {date_period}')
    report = OSCIRankingFactory().get_cls(date_period=date_period)(date=date)
    table = date_period_to_table_map[date_period]
    log.debug(
        date.strftime(f'Load {report.name} for %Y-%m-%d to {table.table_id}'))
    frame = report.read()
    frame = frame[PublicSchemas.company_contributors_ranking.required]
    # Expose the row order as a 1-based rank column.
    frame = frame.reset_index().rename(columns={'index': table.Columns.position})
    frame[table.Columns.position] += 1
    frame = frame.rename(columns=table.mapping)
    frame[table.Columns.date] = date.date()
    return DataLake().big_query.load_dataframe(df=frame,
                                               table_id=table.table_id,
                                               schema=table.schema)
def __init__(self, company: str = None, date_period_type: str = DatePeriodType.YTD):
    """Capture the data-lake handle, commit schema, company, and period type.

    :param company: optional company filter (None means all companies).
    :param date_period_type: reporting period, defaults to year-to-date.
    """
    self.data_lake = DataLake()
    self.company = company
    self.date_period_type = date_period_type
    # Schema describing staged push-commit frames.
    self.commits_schema = self.data_lake.staging.schemas.push_commits
def url(self) -> str:
    """Public URL of this report for its date."""
    public_area = DataLake().public
    return public_area.get_report_url(report_name=self.name, date=self.date)
def read_all(self):
    """Fetch this report for each last-day-of-month up to ``self.date``."""
    public_area = DataLake().public
    return public_area.get_reports_for_last_days_of_month(
        report_name=self.name,
        date=self.date,
        company=self.company)
def save(self, df: pd.DataFrame):
    """Serialize ``df`` via ``self._write`` and store it at ``self.path``."""
    payload = self._write(df)
    DataLake().public.write_bytes_to_file(path=self.path, buffer=payload)
class OSCIChangeExcelWriter:
    """Renders the OSCI change ranking as a formatted Excel worksheet.

    Builds an in-memory workbook (via the data lake's excel writer), writes a
    header line, a styled column header row, the top-N ranking rows, and a
    column of footnote comments, then returns the underlying buffer on save().
    """

    schema = OSCIChangeRankingExcelSchema
    # Cell-format building blocks (xlsxwriter format properties).
    plus_minus_format_rule: Rule = {'num_format': '+0;-0;—'}
    numbers_format_rule: Rule = {'num_format': '0'}
    superscript_format_rule: Rule = {'font_script': 1}
    bold_format_rule: Rule = {'bold': True}
    blue_font_color_rule: Rule = {'font_color': '#2E75B5'}
    align_center_rule: Rule = {'align': 'center'}

    @staticmethod
    def border_rule_fabric(border_size: int) -> Dict[str, int]:
        """Return a border rule of the given size."""
        return {'border': border_size}

    border_rule_1: Rule = {'border': 1}
    # Composite rules for specific column kinds.
    position_change_format_rule: Rule = reduce_rules((border_rule_1,
                                                      bold_format_rule,
                                                      blue_font_color_rule,
                                                      plus_minus_format_rule,
                                                      align_center_rule))
    change_format_rule: Rule = reduce_rules((border_rule_1,
                                             blue_font_color_rule,
                                             plus_minus_format_rule))
    number_cell_format_rule: Rule = reduce_rules((border_rule_1,
                                                  numbers_format_rule))
    # Column layout: (header, superscript suffix, source df key, format, width scale).
    table_columns: List[TableColumn] = [
        TableColumn(schema.position, None, schema.position,
                    reduce_rules((border_rule_1, bold_format_rule,
                                  align_center_rule)),
                    cell_scale=5),
        TableColumn(schema.position_change, STAR_STAR_SUPERSCRIPT,
                    OSCIChangeRankingSchema.position_change,
                    position_change_format_rule, cell_scale=1.2),
        TableColumn(schema.company, None, OSCIChangeRankingSchema.company,
                    border_rule_1, cell_scale=4),
        TableColumn(schema.active, NUMBER_SUPERSCRIPT(1),
                    OSCIChangeRankingSchema.active,
                    number_cell_format_rule, cell_scale=1.2),
        TableColumn(schema.change_suffix, STAR_STAR_SUPERSCRIPT,
                    OSCIChangeRankingSchema.active_change,
                    change_format_rule, cell_scale=1.2),
        TableColumn(schema.total, NUMBER_SUPERSCRIPT(2),
                    OSCIChangeRankingSchema.total,
                    number_cell_format_rule, cell_scale=1.2),
        TableColumn(schema.change_suffix, STAR_STAR_SUPERSCRIPT,
                    OSCIChangeRankingSchema.total_change,
                    change_format_rule, cell_scale=1.2)
    ]

    def __init__(self, sheet_name: str, from_date: datetime,
                 to_date: datetime, top_size: int):
        """Create the workbook and one worksheet named ``sheet_name``.

        ``from_date``/``to_date`` bound the reporting window shown in the
        header; ``top_size`` caps how many ranking rows are written.
        """
        self.writer, self.buffer = DataLake().public.get_excel_writer()
        self.workbook: Workbook = self.writer.book
        self.worksheet: Worksheet = self.workbook.add_worksheet(sheet_name)
        self.from_date = from_date
        self.to_date = to_date
        self.top_size = top_size
        # Pre-built format reused by all rich-string footnote writes.
        self.superscript_format = self.get_format(self.superscript_format_rule)

    def get_format(self, *rules: Rule) -> Format:
        """Merge ``rules`` and register the result as a workbook format."""
        return self.workbook.add_format(reduce_rules(rules))

    def write(self, df):
        """Lay out header, table header, footnotes, and ranking rows for ``df``."""
        header_position = Position(0, 1)
        table_header_position = Position(header_position.row + 2,
                                         header_position.col)
        # Footnotes go one column to the right of the table.
        comments_position = Position(table_header_position.row + 1,
                                     table_header_position.col
                                     + len(self.table_columns) + 1)
        table_position = Position(table_header_position.row + 1,
                                  table_header_position.col)
        self._write_header(position=header_position)
        self._write_table_header(start_from=table_header_position)
        self._write_comments(start_from=comments_position)
        self._write_table(df, start_from=table_position)

    def _write_header(self, position: Position = Position(0, 1)):
        """Write the report title with the covered date range."""
        self.worksheet.write(*position,
                             f'{self.from_date:%Y} '
                             f'(differences from {self.from_date:%B, %d} '
                             f'to {self.to_date:%B, %d})')

    def _write_comments(self, start_from: Position = Position(3, 9)):
        """Write the numbered/starred footnotes explaining the metrics."""
        row = start_from.row
        self.worksheet.write_rich_string(row, start_from.col,
                                         self.superscript_format,
                                         NUMBER_SUPERSCRIPT(1),
                                         ' Active Contributors are those who authored 10 '
                                         'or more pushes in the time period')
        row += 1
        self.worksheet.write_rich_string(row, start_from.col,
                                         self.superscript_format,
                                         NUMBER_SUPERSCRIPT(2),
                                         ' Total Community counts those who authored 1 '
                                         'or more pushes in the time period')
        row += 1
        self.worksheet.write_rich_string(row, start_from.col,
                                         self.superscript_format,
                                         STAR_STAR_SUPERSCRIPT,
                                         ' Changes are relative to the metrics at the end of the previous month')
        row += 2
        self.worksheet.write(row, start_from.col,
                             f'The top {self.top_size} is calculated using the Active Contributors metric')
        row += 1
        self.worksheet.write(row, start_from.col,
                             'If two companies have equal Active Contributors, '
                             'their relative positions are determined by Total Community')

    def _write_table_header(self, start_from: Position):
        """Write one styled header cell per column and set column widths."""
        row, col = start_from
        header_format = self.get_format(self.border_rule_1,
                                        self.bold_format_rule)
        superscript_format = self.get_format(self.bold_format_rule,
                                             self.superscript_format_rule)
        border_format = self.get_format(self.border_rule_1)
        center_header_format = self.get_format(self.border_rule_1,
                                               self.bold_format_rule,
                                               self.align_center_rule)
        for table_column in self.table_columns:
            if table_column.superscript_suffix:
                # Header text followed by a superscript footnote marker.
                self.worksheet.write_rich_string(row, col,
                                                 header_format,
                                                 table_column.name,
                                                 superscript_format,
                                                 table_column.superscript_suffix,
                                                 border_format)
            else:
                # Position columns are centered; the rest left-aligned.
                self.worksheet.write(row, col, table_column.name,
                                     center_header_format
                                     if table_column.name in
                                     {self.schema.position,
                                      self.schema.position_change}
                                     else header_format)
            # Width is proportional to the header text length.
            col_width = len(table_column.name) * table_column.cell_scale
            self.worksheet.set_column(col, col, width=col_width)
            col += 1

    def _write_table(self, df: pd.DataFrame, start_from: Position):
        """Write the top ``self.top_size`` rows of ``df``, one cell at a time."""
        row, start_col = start_from
        for record in df.head(self.top_size).to_dict('records'):
            col = start_col
            for table_column in self.table_columns:
                self.worksheet.write(row, col,
                                     record[table_column.df_key],
                                     self.get_format(table_column.format_rule))
                col += 1
            row += 1

    def save(self) -> BytesIO:
        """Finalize the workbook and return the underlying byte buffer."""
        self.writer.save()
        return self.buffer
def path(self) -> str:
    """Storage path of the OSCI-change Excel report for ``self.to_date``."""
    public_area = DataLake().public
    return public_area.get_osci_change_excel_report_path(
        base_report_name=self.base_name,
        report_dir_name=self.dir_name,
        date=self.to_date)
def path(self) -> str:
    """Storage path of this report for its date."""
    public_area = DataLake().public
    return public_area.get_report_path(report_name=self.name, date=self.date)
def save(self, df: pd.DataFrame):
    """Persist ``df`` as this report for its date."""
    public_area = DataLake().public
    public_area.save_report(report_df=df, report_name=self.name, date=self.date)
def read(self) -> pd.DataFrame:
    """Load this report for its date from the public area."""
    public_area = DataLake().public
    return public_area.get_report(report_name=self.name, date=self.date)
def __init__(self, date_period_type: str = DatePeriodType.YTD):
    """Bind the data lake, commit schema, and the period-specific report class.

    :param date_period_type: reporting period, defaults to year-to-date.
    """
    self.date_period_type = date_period_type
    self.data_lake = DataLake()
    # Schema describing staged push-commit frames.
    self.commits_schema = self.data_lake.staging.schemas.push_commits
    # Resolve the concrete Report subclass for this period up front.
    self.report_cls: Type[Report] = self.REPORT_FACTORY().get_cls(
        date_period=self.date_period_type)