def resolve_function(self, key, kwargs, previous_functions):
    """
    Determines which function is to be run.

    Function name is generally the key, but if a function_name parameter
    is passed this is used instead (useful if you want to call the same
    function more than once).
    """
    if 'function_name' in kwargs:
        qual_function_name = kwargs['function_name']
    else:
        qual_function_name = key

    if "." in qual_function_name:
        module_name, function_name = qual_function_name.split(".")
    else:
        module_name, function_name = None, qual_function_name

    # get function object from function name
    fn = self.get_fn_object(module_name, function_name)
    if fn is None:
        errmsg_raw = "couldn't find a function %s in modules %s"
        errmsg = errmsg_raw % (function_name, ", ".join(
            str(m) for m in self.analytics_modules))
        raise Exception(errmsg)

    self.logger.info("matched function %s to fn %s" % (qual_function_name, str(fn)))

    return AnalyticsFunction(fn,
                             kwargs,
                             previous_functions=previous_functions,
                             storages=self.storages,
                             cachePath=self.cachePath,
                             constants=self.config.get('constants', None),
                             key=key)
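
# A minimal sketch of the name-resolution rule above, pulled out so it can
# be exercised standalone. _split_qualified_name is illustrative only and
# is not part of precipy's API.
def _split_qualified_name(key, kwargs):
    qual = kwargs.get('function_name', key)
    if "." in qual:
        return tuple(qual.split("."))
    return None, qual

# Dotted keys name a module explicitly; a function_name kwarg overrides the key.
assert _split_qualified_name("mymodule.myfn", {}) == ("mymodule", "myfn")
assert _split_qualified_name("plot_a", {"function_name": "make_plot"}) == (None, "make_plot")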
def get_institution_data(af: AnalyticsFunction,
                         focus_year=FOCUS_YEAR,
                         year_range=YEAR_RANGE,
                         project_id=PROJECT_ID,
                         data: str = 'current'):
    if data == 'current':
        tables = 'coki_dashboards.institution'
    elif data == 'local':
        raise NotImplementedError
    else:
        # In this case data must be the path to the relevant BQ table
        # eg observatory.institution20210403
        tables = data
    bq_table = f'academic-observatory.{tables}'

    scope = """AND country_code in ("GBR", "USA", "NLD") """
    args = [credentials, project_id, scope, focus_year, year_range, bq_table]
    inst_table = GenericPublishersTable(*args)
    c = inst_table.df
    inst_oa = GenericOpenAccessTable(*args)
    oa = inst_oa.df

    scope = f"""AND (country_code in ("GBR", "USA", "NLD")) AND (discipline.field in ("{'", "'.join(disciplines)}")) """
    args = [credentials, project_id, scope, focus_year, year_range, bq_table]
    cd_table = GenericDisciplinesTable(*args)
    cd = cd_table.df

    # Merge the per-discipline counts into the main frame, suffixing the
    # duplicated columns with the first word of the discipline name.
    for discipline in disciplines:
        c = c.merge(cd[cd.field == discipline][[
            'id', 'published_year', 'count', 'oa', 'gold', 'green'
        ]],
                    on=['id', 'published_year'],
                    suffixes=(None, f'_{discipline.lower().split(" ")[0]}'))

    c = c.merge(oa[[
        'id', 'published_year', 'oa', 'green', 'gold', 'hybrid', 'gold_doaj'
    ]],
                on=['id', 'published_year'],
                suffixes=(None, '_institution'))

    with pd.HDFStore(CACHE_FILENAME) as store:
        store['institutions'] = c
    af.add_existing_file(CACHE_FILENAME, remove=True)
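
# The merges above rely on pandas' suffixes=(None, '_x') form, which keeps
# the left frame's column names untouched and only suffixes the right-hand
# duplicates. A self-contained illustration with toy data (not project data):
#
#   import pandas as pd
#
#   left = pd.DataFrame({'id': [1, 2], 'published_year': [2020, 2020], 'count': [10, 20]})
#   right = pd.DataFrame({'id': [1, 2], 'published_year': [2020, 2020], 'count': [3, 5]})
#   merged = left.merge(right, on=['id', 'published_year'], suffixes=(None, '_physics'))
#   # merged columns: id, published_year, count, count_physics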
def write_plotly_div(af: AnalyticsFunction,
                     figure: go.Figure,
                     filename: Union[str, Path],
                     full_html: Optional[bool] = True,
                     include_plotlyjs: Optional[Union[str, bool]] = True,
                     auto_play: Optional[bool] = False):
    # Figure.to_html returns the HTML as a string and does not take a file
    # path (that is write_html), so filename is only used for the generated
    # output file below.
    h = figure.to_html(full_html=full_html,
                       include_plotlyjs=include_plotlyjs,
                       auto_play=auto_play)
    for f in af.generate_file(filename):
        f.write(h)
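
# Usage sketch: include_plotlyjs='cdn' keeps the emitted div small by loading
# plotly.js from a CDN instead of inlining it. The call assumes an
# AnalyticsFunction `af` is in scope, as in the report functions above.
#
#   fig = go.Figure(data=go.Bar(y=[1, 3, 2]))
#   write_plotly_div(af, fig, 'bar_chart.html',
#                    full_html=False, include_plotlyjs='cdn')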
def process_institution_data(af: AnalyticsFunction):
    c = load_cache_data(af, get_institution_data, 'institutions')

    chart_utils.calculate_percentages(
        c,
        numer_columns=['count', 'oa', 'gold', 'green'] +
        [f'count_{d}' for d in disc],
        denom_column='total_outputs',
        column_name_add='pc_of_total_')
    chart_utils.calculate_percentages(
        c,
        numer_columns=['gold'] + [f'gold_{d}' for d in disc],
        denom_column='gold_institution',
        column_name_add='pc_of_gold_')

    for d in disc:
        chart_utils.calculate_percentages(
            c,
            numer_columns=['count', 'oa', 'gold', 'green'],
            denom_column=f'count_{d}',
            column_name_add=f'pc_of_{d}_')
        chart_utils.calculate_percentages(
            c,
            numer_columns=['gold'],
            denom_column=f'gold_{d}',
            column_name_add=f'pc_of_gold_{d}_')
        chart_utils.calculate_percentages(
            c,
            numer_columns=[f'gold_{d}', f'green_{d}'],
            denom_column=f'count_{d}',
            column_name_add=f'pc_of_{d}_')

    with pd.HDFStore(CACHE_FILENAME) as store:
        store['institutions'] = c
    af.add_existing_file(CACHE_FILENAME, remove=True)

    c.to_csv('institutions.csv')
    af.add_existing_file('institutions.csv', remove=True)
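
# chart_utils.calculate_percentages is project-internal. A plausible minimal
# sketch of its behaviour, assuming it mutates the frame in place and prefixes
# new column names with column_name_add (an assumption, not the actual source):
def calculate_percentages_sketch(df, numer_columns, denom_column, column_name_add):
    for numer in numer_columns:
        # e.g. column_name_add='pc_of_total_', numer='oa' yields
        # df['pc_of_total_oa'] = 100 * df['oa'] / df['total_outputs']
        df[f'{column_name_add}{numer}'] = df[numer] / df[denom_column] * 100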
def load_cache_data(af: AnalyticsFunction,
                    function_name: Union[str, Callable],
                    element: str):
    """Convenience function for loading pre-prepared DataFrames from the cache

    :param af: The AnalyticsFunction calling this helper
    :param function_name: Name (or the function object itself) of the function
        whose cached file should be opened
    :param element: Component of the file cache to load

    Downloaded query data is collected as DataFrames and stored in an HDF5
    store. This is a convenience function for reloading data from that store.

    TODO: The contents of the store should also be collected in a defined
    metadata element stored in the AnalyticsFunction.
    """
    if callable(function_name):
        afunction_name = function_name.__name__
    else:
        afunction_name = function_name

    store_filepath = af.path_to_cached_file(CACHE_FILENAME, afunction_name)
    with pd.HDFStore(store_filepath) as store:
        df = store[element]
    return df
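
# The cache is a plain pandas HDF5 store keyed by element name; a
# self-contained round trip showing the same pattern (requires the `tables`
# package, and uses a throwaway filename):
#
#   import pandas as pd
#
#   df = pd.DataFrame({'id': [1, 2], 'count': [10, 20]})
#   with pd.HDFStore('demo_cache.h5') as store:
#       store['institutions'] = df          # same key layout as the cache above
#   with pd.HDFStore('demo_cache.h5') as store:
#       restored = store['institutions']    # what load_cache_data returns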
from pathlib import Path

from precipy.analytics_function import AnalyticsFunction
from precipy.batch import Batch
from precipy.storage import GoogleCloudStorage
import os


def bar(af):
    with open("hello.txt", 'w') as f:
        f.write("hello!")
    af.add_existing_file("hello.txt")
    os.remove("hello.txt")


af = AnalyticsFunction(bar, {})

storage = GoogleCloudStorage()
batch = Batch({
    'storages': [storage],
    'cache_bucket_name': 'precipy_testing_cache',
    'output_bucket_name': 'precipy_testing_output'
})


def test_connect():
    storage.init(batch)
    storage.connect()
    assert str(
        storage.cache_storage_bucket) == "<Bucket: precipy_testing_cache>"
    assert str(
        storage.output_storage_bucket) == "<Bucket: precipy_testing_output>"
# `foo` is referenced by the fixtures and assertions below; assumed to be
# defined earlier in the original test module as a minimal no-op analytics
# function (stub added here so the snippet runs standalone).
def foo(af):
    pass


def bar(af):
    with open("hello.txt", 'w') as f:
        f.write("hello!")
    af.add_existing_file("hello.txt")
    os.remove("hello.txt")


def baz(af):
    for f in af.generate_file("hi.txt"):
        f.write("hi!")
    for f in af.generate_file("hola.txt"):
        f.write("hola!")


af = AnalyticsFunction(foo, {})
bf = AnalyticsFunction(bar, {})
cf = AnalyticsFunction(baz, {}, previous_functions={"bar": bf.h})


def test_function_name():
    assert af.function_name == "foo"
    assert bf.function_name == "bar"
    assert cf.function_name == "baz"


def test_function_source():
    assert "def foo" in af.function_source
    assert "def bar" in bf.function_source
    assert "def baz" in cf.function_source