예제 #1
0
    def resolve_function(self, key, kwargs, previous_functions):
        """
        Determine which function is to be run and wrap it for execution.

        The function name is normally ``key``, but a ``function_name`` entry
        in ``kwargs`` takes precedence (useful if you want to call the same
        function more than once). A name may be qualified as
        ``module.function``; only the LAST dot separates the module from the
        function, so a dotted module path such as ``pkg.mod.fn`` resolves to
        module ``pkg.mod`` and function ``fn``.

        :param key: key identifying this step; also the default function name
        :param kwargs: keyword arguments for the step, optionally containing
            ``function_name``
        :param previous_functions: forwarded unchanged to ``AnalyticsFunction``
        :return: an ``AnalyticsFunction`` wrapping the resolved function
        :raises Exception: if ``self.get_fn_object`` returns ``None``
        """
        qual_function_name = kwargs.get('function_name', key)

        # Split on the last dot only. A plain split(".") yields three or more
        # parts for dotted module paths and the two-name unpacking then fails
        # with ValueError before any lookup happens.
        module_name, dot, function_name = qual_function_name.rpartition(".")
        if not dot:
            # Unqualified name: no module restriction.
            module_name = None

        # get function object from function name
        fn = self.get_fn_object(module_name, function_name)
        if fn is None:
            errmsg_raw = "couldn't find a function %s in modules %s"
            errmsg = errmsg_raw % (function_name, ", ".join(
                str(m) for m in self.analytics_modules))
            raise Exception(errmsg)
        self.logger.info("matched function %s to fn %s",
                         qual_function_name, fn)

        return AnalyticsFunction(fn,
                                 kwargs,
                                 previous_functions=previous_functions,
                                 storages=self.storages,
                                 cachePath=self.cachePath,
                                 constants=self.config.get('constants', None),
                                 key=key)
def get_institution_data(af: AnalyticsFunction,
                         focus_year=FOCUS_YEAR,
                         year_range=YEAR_RANGE,
                         project_id=PROJECT_ID,
                         data: str = 'current'):
    """Query institution tables, merge them and cache the combined frame.

    :param af: analytics function the cache file is registered with
    :param focus_year: forwarded to each table query
    :param year_range: forwarded to each table query
    :param project_id: forwarded to each table query
    :param data: ``'current'`` selects ``coki_dashboards.institution``;
        ``'local'`` is not implemented; any other value is used as the table
        path itself (eg ``observatory.institution20210403``)
    :raises NotImplementedError: when ``data == 'local'``

    The merged frame is stored under the ``'institutions'`` key of the HDF
    store at ``CACHE_FILENAME``, which is then handed to
    ``af.add_existing_file`` with ``remove=True``.
    """
    if data == 'local':
        raise NotImplementedError

    # Anything other than 'current' must already be a BQ table path.
    table_path = 'coki_dashboards.institution' if data == 'current' else data
    bq_table = f'academic-observatory.{table_path}'

    country_scope = """AND
country_code in ("GBR", "USA", "NLD")
"""
    country_args = [
        credentials, project_id, country_scope, focus_year, year_range,
        bq_table
    ]
    institutions = GenericPublishersTable(*country_args).df
    open_access = GenericOpenAccessTable(*country_args).df

    quoted_fields = '", "'.join(disciplines)
    discipline_scope = f"""AND
(country_code in ("GBR", "USA", "NLD"))
AND (discipline.field in ("{quoted_fields}"))
    """
    by_discipline = GenericDisciplinesTable(credentials, project_id,
                                            discipline_scope, focus_year,
                                            year_range, bq_table).df

    join_keys = ['id', 'published_year']
    discipline_columns = join_keys + ['count', 'oa', 'gold', 'green']

    # One merge per discipline; clashing columns from the discipline frame
    # get a suffix built from the first word of the lower-cased name.
    for discipline in disciplines:
        short_name = discipline.lower().split(" ")[0]
        rows = by_discipline[by_discipline.field == discipline]
        institutions = institutions.merge(rows[discipline_columns],
                                          on=join_keys,
                                          suffixes=(None, f'_{short_name}'))

    open_access_columns = join_keys + [
        'oa', 'green', 'gold', 'hybrid', 'gold_doaj'
    ]
    institutions = institutions.merge(open_access[open_access_columns],
                                      on=join_keys,
                                      suffixes=(None, '_institution'))

    with pd.HDFStore(CACHE_FILENAME) as store:
        store['institutions'] = institutions
    af.add_existing_file(CACHE_FILENAME, remove=True)
def write_plotly_div(af: AnalyticsFunction,
                     figure: go.Figure,
                     filename: Union[str, Path],
                     full_html: Optional[bool] = True,
                     include_plotlyjs: Optional[Union[str, bool]] = True,
                     auto_play: Optional[bool] = False):
    """Render a plotly figure to HTML and write it through ``af``.

    :param af: analytics function whose ``generate_file`` supplies the
        writable file object(s)
    :param figure: figure to render
    :param filename: name passed to ``af.generate_file``
    :param full_html: forwarded to ``figure.to_html``
    :param include_plotlyjs: forwarded to ``figure.to_html``
    :param auto_play: forwarded to ``figure.to_html``
    """
    # ``to_html`` returns a string and has no filename parameter; its first
    # positional argument is ``config``. Passing ``filename`` there was a
    # mistake, so only the rendering options are forwarded.
    html = figure.to_html(full_html=full_html,
                          include_plotlyjs=include_plotlyjs,
                          auto_play=auto_play)

    for f in af.generate_file(filename):
        f.write(html)
def process_institution_data(af: AnalyticsFunction):
    """Derive percentage columns for the cached institutions frame.

    Loads the ``'institutions'`` frame cached by ``get_institution_data``,
    runs ``chart_utils.calculate_percentages`` over it for the whole-output,
    gold and per-discipline denominators, then writes the frame back to the
    HDF store at ``CACHE_FILENAME`` and to ``institutions.csv``. Both files
    are registered via ``af.add_existing_file`` with ``remove=True``.

    NOTE(review): the return value of ``calculate_percentages`` is ignored,
    so it presumably adds its columns to the frame in place -- confirm.

    :param af: analytics function used to locate the cache and register
        the output files
    """
    institutions = load_cache_data(af, get_institution_data, 'institutions')
    base_columns = ['count', 'oa', 'gold', 'green']

    # Shares of total outputs.
    total_numerators = base_columns + [f'count_{d}' for d in disc]
    chart_utils.calculate_percentages(institutions,
                                      numer_columns=total_numerators,
                                      denom_column='total_outputs',
                                      column_name_add='pc_of_total_')

    # Shares of the institution-level gold count.
    gold_numerators = ['gold'] + [f'gold_{d}' for d in disc]
    chart_utils.calculate_percentages(institutions,
                                      numer_columns=gold_numerators,
                                      denom_column='gold_institution',
                                      column_name_add='pc_of_gold_')

    # Per-discipline denominators, in the same order for every discipline.
    for d in disc:
        discipline_count = f'count_{d}'
        discipline_prefix = f'pc_of_{d}_'

        chart_utils.calculate_percentages(institutions,
                                          numer_columns=list(base_columns),
                                          denom_column=discipline_count,
                                          column_name_add=discipline_prefix)
        chart_utils.calculate_percentages(institutions,
                                          numer_columns=['gold'],
                                          denom_column=f'gold_{d}',
                                          column_name_add=f'pc_of_gold_{d}_')
        chart_utils.calculate_percentages(
            institutions,
            numer_columns=[f'gold_{d}', f'green_{d}'],
            denom_column=discipline_count,
            column_name_add=discipline_prefix)

    with pd.HDFStore(CACHE_FILENAME) as store:
        store['institutions'] = institutions
    af.add_existing_file(CACHE_FILENAME, remove=True)

    institutions.to_csv('institutions.csv')
    af.add_existing_file('institutions.csv', remove=True)
def load_cache_data(af: AnalyticsFunction, function_name: Union[str, Callable],
                    element: str):
    """Load one prepared DataFrame from a function's cached HDF store.

    :param af: analytics function used to resolve the cached file path
    :param function_name: the function whose cache is read, given either as
        its name or as the callable itself (its ``__name__`` is used)
    :param element: key of the item to read from the store
    :return: the object stored under ``element``

    Downloaded query data is collected as DataFrames and kept in an HDF
    store; this helper reloads one of them. TODO The contents of the store
    should also be collected in a defined metadata element stored in the
    Analytics Function.
    """
    producer_name = (function_name.__name__
                     if callable(function_name) else function_name)
    cache_path = af.path_to_cached_file(CACHE_FILENAME, producer_name)

    with pd.HDFStore(cache_path) as store:
        return store[element]
from pathlib import Path
from precipy.analytics_function import AnalyticsFunction
from precipy.batch import Batch
from precipy.storage import GoogleCloudStorage
import os


def bar(af):
    """Write ``hello.txt``, register it with ``af``, then delete it.

    :param af: object whose ``add_existing_file`` receives the file name
    """
    filename = "hello.txt"
    with open(filename, 'w') as handle:
        handle.write("hello!")
    af.add_existing_file(filename)
    # The local copy is removed once ``af`` has been given the file.
    os.remove(filename)


# Wrap ``bar`` in an AnalyticsFunction with an empty kwargs dict.
af = AnalyticsFunction(bar, {})

# Module-level storage and batch used by test_connect; the batch config names
# the cache and output buckets whose string forms that test checks.
storage = GoogleCloudStorage()
batch = Batch({
    'storages': [storage],
    'cache_bucket_name': 'precipy_testing_cache',
    'output_bucket_name': 'precipy_testing_output'
})


def test_connect():
    """Initialise and connect the storage, then check both bucket reprs."""
    storage.init(batch)
    storage.connect()

    cache_bucket = str(storage.cache_storage_bucket)
    assert cache_bucket == "<Bucket: precipy_testing_cache>"

    output_bucket = str(storage.output_storage_bucket)
    assert output_bucket == "<Bucket: precipy_testing_output>"
예제 #7
0
def bar(af):
    """Create ``hello.txt``, pass it to ``af.add_existing_file``, remove it.

    :param af: object whose ``add_existing_file`` receives the file name
    """
    target = "hello.txt"
    with open(target, 'w') as out:
        out.write("hello!")
    af.add_existing_file(target)
    # Clean up the working copy after it has been registered.
    os.remove(target)


def baz(af):
    """Write two small text files through ``af.generate_file``.

    ``hi.txt`` receives ``"hi!"`` and then ``hola.txt`` receives ``"hola!"``.

    :param af: object whose ``generate_file(name)`` yields writable files
    """
    outputs = (("hi.txt", "hi!"), ("hola.txt", "hola!"))
    for filename, text in outputs:
        for handle in af.generate_file(filename):
            handle.write(text)


# One AnalyticsFunction per sample function, each with an empty kwargs dict.
# NOTE(review): ``foo`` is not defined in the visible part of this file.
af = AnalyticsFunction(foo, {})
bf = AnalyticsFunction(bar, {})
# ``cf`` additionally receives ``bf.h`` under the key "bar" as a previous function.
cf = AnalyticsFunction(baz, {}, previous_functions={"bar": bf.h})


def test_function_name():
    """Each wrapper reports the name of the function it wraps."""
    expected_names = ((af, "foo"), (bf, "bar"), (cf, "baz"))
    for wrapper, expected in expected_names:
        assert wrapper.function_name == expected


def test_function_source():
    """Each wrapper's captured source contains its function's ``def`` line."""
    expected_headers = ((af, "def foo"), (bf, "def bar"), (cf, "def baz"))
    for wrapper, header in expected_headers:
        assert header in wrapper.function_source