Exemplo n.º 1
0
def load_licensed_repositories_to_bq(
        date: datetime.datetime) -> bigquery.table.Table:
    """Push the day's licensed repositories from staging into BigQuery.

    :param date: day whose staged repositories are loaded
    :return: the BigQuery table object the dataframe was loaded into
    """
    repos_df = DataLake().staging.get_repositories(date)
    return DataLake().big_query.load_dataframe(
        df=repos_df,
        table_id=BigQueryLicensedRepository.table_id,
        schema=BigQueryLicensedRepository.schema)
Exemplo n.º 2
0
def staging_repository_df():
    """Single-row repositories fixture keyed by the staging schema columns."""
    repos_schema = DataLake().staging.schemas.repositories
    row = {
        repos_schema.name: "epam/OSCI",
        repos_schema.language: "Python",
        repos_schema.license: "gpl-3.0",
        repos_schema.downloaded_at: "2021-01-01",
    }
    return pd.DataFrame([row])
Exemplo n.º 3
0
    def __init__(self, sheet_name: str, from_date: datetime, to_date: datetime, top_size: int):
        """Create the Excel writer and an empty worksheet for this report.

        :param sheet_name: name of the worksheet to add to the workbook
        :param from_date: report period start
        :param to_date: report period end
        :param top_size: number of top entries the report renders
        """
        self.from_date = from_date
        self.to_date = to_date
        self.top_size = top_size

        self.writer, self.buffer = DataLake().public.get_excel_writer()
        self.workbook: Workbook = self.writer.book
        self.worksheet: Worksheet = self.workbook.add_worksheet(sheet_name)

        self.superscript_format = self.get_format(self.superscript_format_rule)
def get_daily_active_repositories(date: datetime.datetime) -> pd.DataFrame:
    """Collect the unique, non-blacklisted repositories pushed to on ``date``.

    Reads the day's raw push-event commits, deduplicates repository names,
    drops repositories blocked by the account blacklist, saves the result to
    the landing zone and returns it.
    """
    repo_name_col = LandingSchemas.push_commits.repo_name
    commits_df = DataLake().staging.get_union_daily_raw_push_events_commits(date=date)

    repos_df = commits_df[[repo_name_col]].drop_duplicates()
    allowed_mask = repos_df.apply(
        lambda row: not Blacklist().is_blocked_repo_by_account(
            repository_name=row[repo_name_col]),
        axis=1)
    repos_df = repos_df[allowed_mask]

    DataLake().landing.save_repositories(df=repos_df, date=date)
    return repos_df
Exemplo n.º 5
0
def filter_out_unlicensed(date: datetime):
    """Keep only push-event commits made to licensed repositories.

    Reads the day's raw push-event commits per company, joins them against
    the licensed repositories to attach license and language, and saves the
    surviving commits back to staging.

    :param date: push events on this day
    """
    log.debug(
        f'Filter out unlicensed push events commits for date {date:%Y-%m-%d}')
    log.debug(f'Read licensed repos for date {date:%Y-%m-%d}')
    licensed_repos_df = Repositories(date=date).read()

    repos_schema = DataLake().staging.schemas.repositories
    commits_schema = DataLake().staging.schemas.push_commits

    for company, df in DataLake().staging.get_daily_raw_push_events_commits(
            date):
        log.debug(
            f'Filter out unlicensed push events commits for date {date:%Y-%m-%d} for {company}'
        )
        filtered_df = filter_and_adjunct_push_event_commit(
            df,
            licensed_repos_df,
            [repos_schema.license],
            [repos_schema.name, repos_schema.language, repos_schema.license],
            commits_schema.required,
            right_index=repos_schema.name,
            left_index=commits_schema.repo_name)
        if filtered_df.empty:
            continue
        DataLake().staging.save_push_events_commits(
            push_event_commits=filtered_df,
            company_name=company,
            date=date)
Exemplo n.º 6
0
def load_repositories(date: datetime) -> pd.DataFrame:
    """Fetch metadata for the repositories crawled on ``date`` and persist it.

    Starts from the repository names stored in the landing zone; when there
    are none, an empty dataframe with the required columns is saved instead.
    """
    log.debug(f'Load repositories information for {date:%Y-%m-%d}')
    names_df = DataLake().landing.get_repositories(date=date)

    if names_df.empty:
        df = pd.DataFrame(data=[], columns=Repositories.schema.required)
    else:
        name_col = DataLake().landing.schemas.repositories_names.name
        df = _load_repositories(repos_names=names_df[name_col])

    Repositories(date=date).save(df)
    return df
 def _execute(self, day: datetime, company: str):
     """Build the month-to-date projects-activity report for one company.

     Filters the MTD repository ranking against the projects filter list and
     publishes the result as the 'projects_activity_MTD' report.
     """
     ranking_df = ReposRankingMTD(date=day, company=company).read()
     ranking_schema = DataLake().public.schemas.repo_commits_ranking
     report_df = filter_projects(
         df=ranking_df,
         projects_filter_list=DataLake().staging.load_projects_filter(),
         commits_amount_field=ranking_schema.commits,
         repo_name_field=ranking_schema.repo)
     DataLake().public.save_report(report_df=report_df,
                                   report_name='projects_activity_MTD',
                                   date=day,
                                   company=company)
Exemplo n.º 8
0
def load_push_events_to_bq(date: datetime.datetime,
                           hour: int) -> Dict[str, Dict[str, Any]]:
    """Load one hour of staged push-event commits into BigQuery.

    :param date: day of the events
    :param hour: hour of that day to load
    :return: summary of the load: row count, column count and table id
    """
    date = date.replace(hour=hour)
    commits_df = DataLake().staging.get_push_events_commits(
        from_date=date, to_date=date, date_period_type=DatePeriodType.DTD)
    loaded_table = DataLake().big_query.load_dataframe(
        df=commits_df,
        table_id=BigQueryPushEventsCommitsColumns.table_id,
        schema=BigQueryPushEventsCommitsColumns.schema)
    return {
        'num_rows': loaded_table.num_rows,
        'num_columns': len(loaded_table.schema),
        'table_id': BigQueryPushEventsCommitsColumns.table_id,
    }
Exemplo n.º 9
0
def process_github_daily_push_events(day: datetime.datetime):
    """Split a day's landing push-event commits by company and stage them."""
    push_events_commits = DataLake().landing.get_daily_push_events_commits(
        date=day)
    # Nothing landed for this day — nothing to stage.
    if push_events_commits is None or push_events_commits.empty:
        return
    landing_schema = DataLake().landing.schemas.push_commits
    companies_events = process_push_commits(
        push_events_commits,
        email_field=landing_schema.author_email,
        company_field=DataLake().staging.schemas.push_commits.company,
        datetime_field=landing_schema.event_created_at)
    for company, commits in companies_events:
        log.debug(f'Save company {company}')
        DataLake().staging.save_raw_push_events_commits(
            push_event_commits=commits, date=day, company_name=company)
Exemplo n.º 10
0
def generate_email_body(date: datetime, company=Config().default_company):
    """Render and save the OSCI change-ranking email body for ``company``.

    Builds the 1-based change ranking, extracts the biggest climbers and
    fallers plus the company's neighbourhood in the ranking, renders the
    email template with report links, and saves the result via the public
    data-lake zone.

    :param date: report date the change ranking is read for
    :param company: company highlighted in the email (defaults from Config)
    """
    report = OSCIChangeRanking(date=date)
    company_contributors_ranking_schema = DataLake(
    ).public.schemas.company_contributors_ranking

    # Turn the positional index into an explicit, 1-based "position" column.
    change_ranking = report.read().reset_index()
    change_ranking = change_ranking.rename(
        columns={'index': company_contributors_ranking_schema.position})
    change_ranking[company_contributors_ranking_schema.position] += 1
    change_ranking = __cast_columns_to_int(
        df=change_ranking,
        columns=[
            report.schema.total,
            report.schema.active,
            company_contributors_ranking_schema.position,
            report.schema.total_change,
            report.schema.active_change,
            report.schema.position_change,
        ])
    # Companies that moved up/down, with arrow prefixes on the change column.
    shift_up = __add_arrows_prefix(df=__get_shift_up(
        change_ranking=change_ranking,
        change_position_field=report.schema.position_change),
                                   column=report.schema.position_change)
    shift_down = __add_arrows_prefix(df=__get_shift_down(
        change_ranking=change_ranking,
        change_position_field=report.schema.position_change),
                                     column=report.schema.position_change)
    # The highlighted company together with its ranking neighbours.
    company_position = __add_arrows_prefix(
        df=__get_company_neighbors(
            df=change_ranking,
            company=company,
            company_field=report.schema.company,
            rank_field=company_contributors_ranking_schema.position),
        column=report.schema.position_change)
    # Render the template and persist the email body in one step.
    DataLake().public.save_email(email_body=EmailBodyTemplate().render(
        date=date,
        compared_date=get_previous_date(date),
        shift_up=shift_up,
        shift_down=shift_down,
        company=company,
        company_position=company_position,
        solutionshub_osci_change_ranking=OSCIChangeRankingExcel(
            to_date=date).url,
        osci_reports_urls={
            name: report_cls(date=date).url
            for name, report_cls in OSCI_REPORTS_URLS.items()
        }),
                                 date=date)
def load_companies_contrib_repos_to_bq(date: datetime.datetime) -> bigquery.table.Table:
    """Load companies contributors repositories to BigQuery for a given day"""
    table = BigQueryCompaniesContributorsRepositoriesCommitsColumns
    commits_df = CompaniesContributorsRepository(date).read()
    commits_df = commits_df.rename(columns=table.mapping)
    return DataLake().big_query.load_dataframe(df=commits_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
Exemplo n.º 12
0
class Repositories:
    """Staged repositories dataset for a single day."""

    # Staging schema describing the repositories dataframe columns.
    schema = DataLake().staging.schemas.repositories

    def __init__(self, date: datetime.datetime):
        self.date = date

    @property
    def path(self) -> str:
        """Full path to this date's repositories file."""
        return DataLake().staging.get_repositories_path(self.date)

    @property
    def spark_path(self) -> str:
        """Spark-style path to this date's repositories file."""
        return DataLake().staging.get_repositories_spark_path(self.date)

    def save(self, df: pd.DataFrame):
        """Persist ``df`` as this date's repositories file.

        :param df: repositories dataframe to save
        """
        return DataLake().staging.save_repositories(df, self.date)

    def read(self) -> pd.DataFrame:
        """Load this date's repositories file into a dataframe."""
        return DataLake().staging.get_repositories(self.date)
Exemplo n.º 13
0
class CompaniesContributorsRepository:
    """Company/contributors/repository commits dataset for a single day."""

    # Public schema describing the dataset's columns.
    schema = DataLake().public.schemas.company_contributors_repository_commits

    def __init__(self, date: datetime.datetime):
        self.date = date

    @property
    def path(self) -> str:
        """Full path to this date's company contributors repository commits."""
        return DataLake().public.get_companies_contributors_repository_commits_path(self.date)

    @property
    def spark_path(self) -> str:
        """Spark-style path to this date's company contributors repository commits."""
        return DataLake().public.get_companies_contributors_repository_commits_spark_path(self.date)

    def save(self, df: pd.DataFrame):
        """Persist ``df`` as this date's company contributors repository commits.

        :param df: dataframe to save
        """
        return DataLake().public.save_companies_contributors_repository_commits(df, self.date)

    def read(self) -> pd.DataFrame:
        """Load this date's company contributors repository commits."""
        return DataLake().public.get_companies_contributors_repository_commits(self.date)
Exemplo n.º 14
0
def get_github_daily_push_events(day: datetime.datetime):
    """Crawl all 24 hours of GitHub Archive push events for ``day`` and save
    each hour's commits to the landing zone.

    :param day: day to crawl; its time component is overwritten hour by hour
    """
    with GithubArchiveRest() as rest:
        for hour in range(24):
            # Set the hour before logging: the original logged first, so the
            # message showed the previous iteration's hour, not the one
            # actually being crawled.
            day = day.replace(hour=hour)
            log.info(f'Crawl events for {day}')
            push_events_commits = get_hour_push_events_commits(day=day,
                                                               rest=rest)
            DataLake().landing.save_push_events_commits(
                push_event_commits=push_events_commits, date=day)
Exemplo n.º 15
0
def raw_push_events_commit_df():
    """Single-row raw push-event commit fixture keyed by the staging schema."""
    commits_schema = DataLake().staging.schemas.push_commits
    row = {
        commits_schema.event_id: "1111111",
        commits_schema.event_created_at: "2021-01-01 00:15:22+00:00",
        commits_schema.actor_login: "******",
        commits_schema.repo_name: "epam/OSCI",
        commits_schema.org_name: 'EPAM',
        commits_schema.sha: "aaa656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        commits_schema.author_name: "User Name",
        commits_schema.author_email: "*****@*****.**",
        commits_schema.company: "EPAM",
    }
    return pd.DataFrame([row])
Exemplo n.º 16
0
def load_osci_general_reports_to_bq(date: datetime.datetime):
    """Load the OSCI general ranking report for ``date`` into BigQuery."""
    report = OSCIGeneralRanking(date=date)
    table = BigQueryOSCIGeneralRankingReport
    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    ranking_df = report.read()[PublicSchemas.osci_general_report.required]
    # Ranks are published 1-based.
    ranking_df[table.Columns.position] += 1
    ranking_df = ranking_df.rename(columns=table.mapping)
    ranking_df[table.Columns.date] = date

    return DataLake().big_query.load_dataframe(df=ranking_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
Exemplo n.º 17
0
def load_company_repositories_events_commits(date: datetime, company: str):
    """Crawl commit details for every repository ``company`` pushed to on
    ``date`` and save them as private push-event commits.
    """
    schema = DataLake().staging.schemas.push_commits
    events = DataLake().staging.get_push_events_commits(
        company=company,
        from_date=date,
        to_date=date,
        date_period_type=DatePeriodType.DTD)
    if events.empty:
        log.warning(f'No {company} events at {date}')
        return
    with GithubRest(token=Config().github_token) as rest:
        company_commits = get_company_repositories_events_commits(
            repositories_names=events[schema.repo_name].unique(),
            date=date,
            company=company,
            rest=rest)
        DataLake().staging.save_private_push_events_commits(
            push_event_commits=pd.DataFrame(company_commits),
            company_name=company,
            date=date)
Exemplo n.º 18
0
def get_contributors_repositories_change(date: datetime, company: str):
    """Compute and save the contributors and repositories that are new for
    ``company`` relative to the previous period's YTD ranking.
    """
    ranking = ContributorsReposYTD(date=date, company=company)
    current_df = ranking.read()
    previous_df = ContributorsReposYTD(date=get_previous_date(date),
                                       company=company).read()

    # Authors present now but absent from the compared period.
    authors_added = (set(current_df[ranking.schema.author]) -
                     set(previous_df[ranking.schema.author]))
    new_contributors_df = pd.DataFrame(
        data=authors_added,
        columns=[DataLake().public.schemas.new_contributors.author])
    NewContributors(date=date, company=company).save(df=new_contributors_df)

    # Repositories present now but absent from the compared period.
    repos_added = (set(current_df[ranking.schema.repo]) -
                   set(previous_df[ranking.schema.repo]))
    new_repos_df = pd.DataFrame(
        data=repos_added,
        columns=[DataLake().public.schemas.new_repos.repo])
    NewRepos(date=date, company=company).save(df=new_repos_df)
Exemplo n.º 19
0
def no_match_license_raw_push_event_commit_df():
    """Raw push-event commit fixture for a repository with no license match."""
    schema = DataLake().staging.schemas.push_commits
    row = {
        schema.event_id: "222222",
        schema.event_created_at: "2021-01-01 00:15:22+00:00",
        schema.actor_login: "******",
        schema.repo_name: "test/TEST",
        schema.org_name: None,
        schema.sha: "bbb656b0d05ec5b8ed5beb2f94c4aa11ea111a1a",
        schema.author_name: "User Name",
        schema.author_email: "*****@*****.**",
        schema.company: "EPAM",
    }
    return pd.DataFrame([row])
Exemplo n.º 20
0
def load_osci_daily_ranking_to_bq(date: datetime.datetime):
    """Load Daily Change ranking to Big Query"""
    report = OSCIChangeRankingDTD(date=date)
    table = BigQueryOSCIDailyChangeRankingReport

    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    ranking_df = (report.read()
                  .reset_index()
                  .rename(columns={'index': table.Columns.position}))
    ranking_df = ranking_df[PublicSchemas.osci_ranking_schema.required]
    # Positions are 0-based after reset_index; published ranks start at 1.
    ranking_df[table.Columns.position] += 1
    ranking_df = ranking_df.rename(columns=table.mapping)
    ranking_df[table.Columns.date] = date.date()

    return DataLake().big_query.load_dataframe(df=ranking_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
Exemplo n.º 21
0
def load_osci_ranking_to_bq(date: datetime.datetime,
                            date_period: str = DatePeriodType.YTD):
    """Load an MTD or YTD OSCI ranking report into BigQuery.

    :param date: report date
    :param date_period: DatePeriodType.MTD or DatePeriodType.YTD
    :raises ValueError: for any other date period
    """
    if date_period not in (DatePeriodType.MTD, DatePeriodType.YTD):
        raise ValueError(f'Unsupported {date_period}')
    report = OSCIRankingFactory().get_cls(date_period=date_period)(date=date)
    table = date_period_to_table_map[date_period]

    log.debug(f'Load {report.name} for {date:%Y-%m-%d} to {table.table_id}')

    ranking_df = report.read()[PublicSchemas.company_contributors_ranking.required]
    ranking_df = ranking_df.reset_index().rename(
        columns={'index': table.Columns.position})
    # Positions are 0-based after reset_index; published ranks start at 1.
    ranking_df[table.Columns.position] += 1
    ranking_df = ranking_df.rename(columns=table.mapping)
    ranking_df[table.Columns.date] = date.date()

    return DataLake().big_query.load_dataframe(df=ranking_df,
                                               table_id=table.table_id,
                                               schema=table.schema)
Exemplo n.º 22
0
 def read(self) -> pd.DataFrame:
     """Load this date's company contributors repository commits from the
     public zone into a dataframe.
     """
     public_zone = DataLake().public
     return public_zone.get_companies_contributors_repository_commits(self.date)
Exemplo n.º 23
0
 def save(self, df: pd.DataFrame):
     """Persist ``df`` as this date's company contributors repository commits.

     :param df: dataframe to save
     """
     public_zone = DataLake().public
     return public_zone.save_companies_contributors_repository_commits(df, self.date)
Exemplo n.º 24
0
 def spark_path(self) -> str:
     """Spark-style path to this date's company contributors repository
     commits.
     """
     public_zone = DataLake().public
     return public_zone.get_companies_contributors_repository_commits_spark_path(self.date)
Exemplo n.º 25
0
 def url(self) -> str:
     """Public URL of this report for its date."""
     public_zone = DataLake().public
     return public_zone.get_report_url(report_name=self.name, date=self.date)
def transfer_monthly_change_ranking(date: datetime) -> dict:
    """Generate the monthly web OSCI change ranking, publish it to the web
    zone and return it.
    """
    ranking = generate_web_osci_change_ranking(date)
    DataLake().web.save_monthly_osci_ranking(ranking=ranking, date=date)
    return ranking
Exemplo n.º 27
0
 def read(self) -> pd.DataFrame:
     """Load this report for its date from the public zone."""
     public_zone = DataLake().public
     return public_zone.get_report(report_name=self.name, date=self.date)
Exemplo n.º 28
0
 def path(self) -> str:
     """Full path to this report for its date."""
     public_zone = DataLake().public
     return public_zone.get_report_path(report_name=self.name, date=self.date)
Exemplo n.º 29
0
 def save(self, df: pd.DataFrame):
     """Publish ``df`` as this report for its date.

     :param df: report dataframe to save
     """
     DataLake().public.save_report(report_df=df,
                                   report_name=self.name,
                                   date=self.date)
Exemplo n.º 30
0
def adjunct_columns():
    """Repository-schema columns adjoined onto push-event commits."""
    repos_schema = DataLake().staging.schemas.repositories
    return [repos_schema.name, repos_schema.language, repos_schema.license]