예제 #1
0
 def __init__(self, config, db_session=None, fill_date=None):
     """Load configuration, obtain a database session and resolve the fill.

     Args:
         config: passed to ``read_config_file`` together with ``__file__``;
             the loaded mapping must contain a ``'generation'`` entry.
         db_session: session to reuse; when falsy, a new one is obtained
             from ``session_factory()``.
         fill_date: date string parsed with ``make_dump_date_from_str`` and
             resolved through ``get_exact_fill_id``. When ``None`` the fill
             comes from ``get_latest_fill_id`` instead.
     """
     loaded_config = read_config_file(config, __file__)
     self.config = loaded_config
     self.config_generation = loaded_config['generation']
     self.db_session = db_session or session_factory()

     # Both lookups return a pair that is unpacked into (fill, fill date).
     if fill_date is not None:
         requested_date = make_dump_date_from_str(fill_date)
         fill_lookup = get_exact_fill_id(self.db_session, requested_date)
     else:
         fill_lookup = get_latest_fill_id(self.db_session)
     self.curr_fill, self.curr_fill_date = fill_lookup

     # Placeholders; nothing in this constructor populates them.
     self.metric_combinations = self.metric_creator = self.metric_job = None
     self.pid = os.getpid()
예제 #2
0
 def __init__(self, config):
     """Initialise orchestrator state from a config file and the environment.

     Args:
         config: passed to ``read_config_file`` together with ``__file__``;
             the loaded mapping must contain an ``'insertion'`` entry.

     Environment variables read:
         HUMANIKI_BACKFILL: stored as ``frontfill_backfill``
             (default ``"front"``).
         HUMANIKI_OVERRIDE_DATE: when set, parsed with
             ``make_dump_date_from_str`` into ``working_fill_date``.
         HUMANIKI_NUM_PROCS: stored as an ``int`` in ``num_procs``
             (default 4).
         HUMANIKI_DRY_RUN: integer flag (default ``'0'``) stored as a bool.

     Raises:
         ValueError: if HUMANIKI_NUM_PROCS or HUMANIKI_DRY_RUN is set to
             something that is not an integer string.
     """
     self.config = read_config_file(config, __file__)
     self.config_insertion = self.config['insertion']
     self.overwrite = self.config_insertion[
         'overwrite'] if 'overwrite' in self.config_insertion else False
     self.frontfill_backfill = os.getenv("HUMANIKI_BACKFILL", "front")
     self.db_session = db.session_factory()
     self.working_fill_date = None
     self.humaniki_override_date = os.getenv("HUMANIKI_OVERRIDE_DATE", None)
     if self.humaniki_override_date is not None:
         self.working_fill_date = make_dump_date_from_str(
             self.humaniki_override_date)
     self.metrics_factory = None
     # os.getenv returns a str when the variable is set but the default (4)
     # is an int; convert so num_procs always has the same type.
     self.num_procs = int(os.getenv("HUMANIKI_NUM_PROCS", 4))
     self.fill_id = None
     self.dry_run = bool(int(os.getenv('HUMANIKI_DRY_RUN', '0')))
     log.info("Humaniki Orchestrator intialized")
예제 #3
0
 def _create_metric_creators(self):
     """Build a MetricCreator from ``self.metric_job`` and keep it on self.

     When ``self.metric_job`` is ``None`` there is nothing to build: a
     message is logged and the process exits with status 29. Otherwise the
     creator is assembled from the job's ``detail`` entries and ``fill_id``
     and stored in ``self.metric_creator``.
     """
     job = self.metric_job
     if job is None:
         log.info(f'PID:{self.pid} No metrics creator to hydrate')
         sys.exit(29)  # special signal to calling bash.
     else:
         # Build the pieces in the same order the keyword arguments appear.
         population = PopulationDefinition(
             job.detail["population_definition"])
         bias = Properties(job.detail['bias_property'])
         dimensions = [
             Properties(dimension)
             for dimension in job.detail['dimension_properties']
         ]
         self.metric_creator = MetricCreator(
             population_definition=population,
             bias_property=bias,
             dimension_properties=dimensions,
             threshold=job.detail['threshold'],
             fill_id=job.fill_id,
             properties_id=job.detail['properties_id'],
             db_session=session_factory())
         log.info("hydrate metric creator")
예제 #4
0
 def __init__(self, config, dump_date=None, dump_subset=None, insert_strategy=None):
     """Set up inserter state: configuration, dump selection and table layout.

     Args:
         config: passed to ``hs_utils.read_config_file`` together with
             ``__file__``; the loaded mapping must contain ``'insertion'``.
         dump_date: when truthy, parsed with
             ``hs_utils.make_dump_date_from_str``; otherwise ``self.dump_date``
             is ``None``.
         dump_subset: stored unchanged.
         insert_strategy: stored unchanged, or ``"infile"`` when ``None``.
     """

     def table_spec(insert_columns, extra_const_columns=None, escaping_options=""):
         # One entry of table_column_map; a fresh dict is created when no
         # constant columns are given so entries never share state.
         return {
             "insert_columns": insert_columns,
             "extra_const_columns": {} if extra_const_columns is None else extra_const_columns,
             "escaping_options": escaping_options,
         }

     self.config = hs_utils.read_config_file(config, __file__)
     insertion_cfg = self.config['insertion']
     self.config_insertion = insertion_cfg

     # Optional keys of the insertion section, with their fallbacks.
     if 'overwrite' in insertion_cfg:
         self.overwrite = insertion_cfg['overwrite']
     else:
         self.overwrite = False
     if 'only_files' in insertion_cfg:
         self.only_files = insertion_cfg['only_files']
     else:
         self.only_files = None

     if insert_strategy is None:
         self.insert_strategy = "infile"
     else:
         self.insert_strategy = insert_strategy

     if dump_date:
         self.dump_date = hs_utils.make_dump_date_from_str(dump_date)
     else:
         self.dump_date = None
     self.dump_subset = dump_subset

     # Not known yet at construction time.
     self.dump_date_str = None
     self.fill_id = None
     self.detection_type = None

     self.db_session = session_factory()

     # Ordering matters here because of foreign key constraints.
     self.csvs = None
     self.CSV_NA_VALUE = r'\N'

     # Per-table description: columns to insert, constant columns to add,
     # and any escaping options.
     self.table_column_map = {
         'human': table_spec(['qid', 'gender', 'year_of_birth', 'sitelink_count']),
         'human_country': table_spec(['human_id', 'country']),
         'human_occupation': table_spec(['human_id', 'occupation']),
         'human_sitelink': table_spec(['human_id', 'sitelink']),
         'label': table_spec(
             ['qid', 'label'],
             extra_const_columns={'lang': 'en'},
             escaping_options="""OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\\\'"""),
         'occupation_parent': table_spec(['occupation', 'parent']),
     }
from humaniki_backend import app
from humaniki_schema.schema import metric
from humaniki_schema.utils import read_config_file
from unittest import TestCase
# Bare TestCase instance created at import time (presumably for its assert*
# helpers -- its use is not visible in this part of the file).
tc = TestCase()

# Load the configuration from the file named by the HUMANIKI_YAML_CONFIG
# environment variable; a missing variable raises KeyError at import time.
config = read_config_file(os.environ['HUMANIKI_YAML_CONFIG'], __file__)

# TODO: If the data is generated separately before the tests are run, they pass. But when the
# data is generated here, sometimes no metrics are created even though the metrics count
# reported below is nonzero. The cause is still unknown.
# Generation runs unless the 'test' config section sets 'skip_gen' to a truthy value.
skip_generation = config['test']['skip_gen'] if 'skip_gen' in config['test'] else False
if not skip_generation:
    generated = generate_example_data.generate_all(config=config)
    print(f'generated: {generated}')
    # Sanity check: count the metric rows and abort the import if there are none.
    session = db.session_factory()
    metrics_count = session.query(func.count(metric.fill_id)).scalar()
    print(f'number of metrics: {metrics_count}')
    assert metrics_count>0

@pytest.fixture
def test_jsons():
    """Load every ``.json`` file found in the configured test data directory.

    The directory is read from ``config['test']['test_datadir']``.

    Returns:
        dict: maps each file name (including the ``.json`` suffix) to its
        parsed JSON content.
    """
    test_files = {}
    test_datadir = config['test']['test_datadir']
    for json_f in os.listdir(test_datadir):
        if not json_f.endswith('.json'):
            continue
        # Open inside a context manager so the handle is closed as soon as
        # the file has been parsed instead of being left to the garbage
        # collector.
        with open(os.path.join(test_datadir, json_f)) as json_file:
            test_files[json_f] = json.load(json_file)
    return test_files
예제 #6
0
import sqlalchemy

from sqlalchemy import func, and_, or_, cast, String
from sqlalchemy.dialects.mysql import JSON
from sqlalchemy.orm import aliased
from sqlalchemy.orm.attributes import flag_modified

from humaniki_schema import utils as hs_utils
from humaniki_schema.schema import fill, metric_properties_j, metric_properties_n, metric_aggregations_j, \
    metric_aggregations_n, \
    project, metric
import humaniki_schema.utils as hs_utils

from humaniki_schema.db import session_factory

# Module-level session, obtained from session_factory() when this module is imported.
db_session = session_factory()


def get_properties_obj(bias_property,
                       dimension_properties,
                       session=None,
                       table=metric_properties_j,
                       as_subquery=False,
                       create_if_no_exist=False):
    """
    the main entry point, get a property id, by the bias property (like gender) and the other dimensions
    the order we are storing the properties, either in json or normalized is
    0: bias_value, 1..n: properties sorted by their PID (and sitelink is faked as PID 0).

    overview
    get_property