Example #1
    def frontfill_determine_needs_run_from_remote_fill_dt(self):
        """Return True if the remote dump date is newer than what we have locally."""
        if self.working_fill_date is None:
            # i.e. it wasn't already set, e.g. by an override
            try:
                latest_local_fill_id, latest_local_fill_date = get_latest_fill_id(
                    self.db_session)
            except sqlalchemy.orm.exc.NoResultFound:
                # this is the very first run; there is no local fill yet
                latest_local_fill_date = datetime(
                    2012, 1, 1).date()  # when Wikidata first started

            wd_dir_raw = os.listdir(os.environ['HUMANIKI_DUMP_DIR'])
            wd_dir_ls = [
                os.path.join(os.environ['HUMANIKI_DUMP_DIR'], p)
                for p in wd_dir_raw
            ]
            # resolve symlinks (guarding against non-link entries) so broken
            # links can be filtered out by the existence check below
            wd_dir_ls_links = [
                os.path.join(os.environ['HUMANIKI_DUMP_DIR'], os.readlink(l))
                if os.path.islink(l) else l
                for l in wd_dir_ls
            ]
            wd_dir_ls_exists = [
                f for f in wd_dir_ls_links if os.path.exists(f)
            ]
            # keep only files named like YYYYMMDD.json.gz
            wd_dir_ls_exists_correct = [
                f for f in wd_dir_ls_exists
                if is_wikimedia_cloud_dump_format(f)
            ]
            log.info(
                f'Existing and correct dump files found were {wd_dir_ls_exists_correct}'
            )
            wd_dir_dts = [
                make_dump_date_from_str(
                    numeric_part_of_filename(dt_s, basenameittoo=True))
                for dt_s in wd_dir_ls_exists_correct
            ]
            remote_later_than_local = [
                fd for fd in wd_dir_dts if fd > latest_local_fill_date
            ]

            if remote_later_than_local:
                log.info(
                    f"Latest local was {latest_local_fill_date}, and {len(remote_later_than_local)} remote dts later"
                )
                # select the remote fill date that's earliest but still later than local
                remote_infimum_date = min(remote_later_than_local)
                self.working_fill_date = remote_infimum_date
            else:
                log.info(
                    f"Latest local was {latest_local_fill_date}, and nothing later from {len(wd_dir_dts)} remote dts"
                )
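To make the selection step concrete, here is a standalone sketch (using plain datetime.date values, not part of the humaniki codebase) of how the earliest remote dump date newer than the local fill gets picked:

from datetime import date

latest_local_fill_date = date(2021, 3, 1)
wd_dir_dts = [date(2021, 2, 22), date(2021, 3, 8), date(2021, 3, 15)]

# keep only remote dump dates strictly later than the local fill
remote_later_than_local = [fd for fd in wd_dir_dts if fd > latest_local_fill_date]

# the "infimum": the earliest remote dump that is still newer than local
assert min(remote_later_than_local) == date(2021, 3, 8)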
Example #2
    def __init__(self, config, db_session=None, fill_date=None):
        self.config = read_config_file(config, __file__)
        self.config_generation = self.config['generation']
        self.db_session = db_session if db_session else session_factory()
        if fill_date is None:
            self.curr_fill, self.curr_fill_date = get_latest_fill_id(
                self.db_session)
        else:
            fill_dt = make_dump_date_from_str(fill_date)
            self.curr_fill, self.curr_fill_date = get_exact_fill_id(
                self.db_session, fill_dt)
        self.metric_combinations = None
        self.metric_creator = None
        self.metric_job = None
        self.pid = os.getpid()
Example #3
def insert_or_skip(config, session):
    skip_insert = config['test'].get('skip_insert', False)
    if not skip_insert:
        data_dir = config['generation']['example']['datadir']
        num_fills = config['generation']['example']['fills']
        example_len = config['generation']['example']['len']
        curr_fill_id = insert_data(data_dir=data_dir,
                                   num_fills=num_fills,
                                   example_len=example_len)
        metrics_count = session.query(func.count(metric.fill_id)).scalar()
        print(f'number of metrics: {metrics_count}')
        # if we are inserting, we expect a clean slate with no pre-existing metrics
        assert metrics_count == 0
    else:
        # we'll still need the curr_fill otherwise
        curr_fill_id, curr_fill_dt = get_latest_fill_id(session)
    return curr_fill_id
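For reference, a minimal sketch of the config shape insert_or_skip reads; the keys mirror the lookups above, while the values are hypothetical:

config = {
    'test': {
        'skip_insert': False,  # set True to reuse an existing fill instead of inserting
    },
    'generation': {
        'example': {
            'datadir': '/tmp/humaniki_example_data',  # hypothetical path
            'fills': 2,
            'len': 100,
        },
    },
}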
Example #4
def gap(bias, snapshot, population):
    latest_fill_id, latest_fill_date = get_latest_fill_id(session)
    return_warnings = {}
    errors = {}
    query_params = request.values

    # If a client explicitly asks for an error to be sent back.
    if "error_test" in query_params.keys():
        errors['test'] = repr(ValueError('sending you back a value error'))
        errors['test_another'] = repr(
            ValueError('simulating what multiple errors would look like'))
        return jsonify(errors=errors)

    try:
        # TODO include validating bias
        valid_request = assert_gap_request_valid(snapshot, population,
                                                 query_params)
    except AssertionError as ae:
        errors['validation'] = repr(ae)
        # in this case, fail immediately
        return jsonify(errors=errors)
    # handle snapshot
    requested_fill_id, requested_fill_date, snapshot_corrected = determine_fill_id(
        session, snapshot, latest_fill_id, latest_fill_date)
    # print(f"Fills {requested_fill_id} {requested_fill_date}")
    if snapshot_corrected:
        return_warnings['snapshot_corrected to'] = requested_fill_date
    # handle populations
    population_id, population_name, population_corrected = determine_population_conflict(
        population, query_params)
    if population_corrected:
        return_warnings['population_corrected to'] = population_name
    # order query params by property pid
    ordered_query_params, non_orderable_query_params = order_query_params(
        query_params)
    # get properties-id
    try:
        bias_property = get_pid_from_str(bias)
        ordered_properties = ordered_query_params.keys()
        properties_id = get_properties_obj(
            session=session,
            dimension_properties=ordered_properties,
            bias_property=bias_property)
        # properties_id = get_properties_id(session, ordered_properties, bias_property=bias_property)
    except ValueError as ve:
        errors['properties_id'] = repr(ve)
        log.exception(errors)
        # fail fast: properties_id would otherwise be unbound below
        return jsonify(errors=errors)

    # get coverage
    coverage = get_coverage(session=session,
                            population_id=population_id,
                            properties_id=properties_id.id,
                            fill_id=requested_fill_id)

    # get aggregations-id
    try:
        aggregations_id_preds = get_aggregations_id_preds(
            session,
            ordered_query_params,
            non_orderable_query_params,
            as_subquery=True)
    except ValueError as ve:
        errors['aggregations_id_preds'] = repr(ve)
        log.exception(errors)
        # fail fast: aggregations_id_preds would otherwise be unbound below
        return jsonify(errors=errors)
    # get metric
    try:
        # pass the requested label_lang if provided (None otherwise;
        # downstream may default it to 'en')
        label_lang = non_orderable_query_params.get('label_lang')
        metrics, represented_biases = build_metrics(
            session,
            fill_id=requested_fill_id,
            population_id=population_id,
            properties_id=properties_id,
            aggregations_id=aggregations_id_preds,
            label_lang=label_lang)
    except ValueError as ve:
        errors['metrics'] = repr(ve)

    # if there are errors, return those
    if errors:
        return jsonify(errors=errors)

    meta = {
        'snapshot': str(requested_fill_date),
        'population': population_name,
        'population_corrected': population_corrected,
        'label_lang': label_lang,
        'bias': bias,
        'bias_property': bias_property,
        'aggregation_properties': [Properties(p).name for p in properties_id.properties],
        'coverage': coverage,
    }
    if represented_biases:
        meta['bias_labels'] = represented_biases
    full_response = {'meta': meta, 'metrics': metrics}
    return jsonify(**full_response)
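The route decorator for gap() is not shown in this snippet. Assuming the view is registered with bias, snapshot and population as URL parameters (the path below is hypothetical), the error_test branch can be exercised with Flask's test client:

# sketch only; the real URL pattern is not shown in this snippet
client = app.test_client()
resp = client.get('/v1/gender/gap/latest/all_wikidata/properties?error_test=1')
print(resp.get_json())  # {'errors': {'test': '...', 'test_another': '...'}}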
Example #5
    get_metrics_count, get_all_snapshot_dates, get_coverage
from humaniki_backend.utils import determine_population_conflict, assert_gap_request_valid, \
    order_query_params, get_pid_from_str, determine_fill_id, is_property_exclusively_citizenship
from humaniki_schema.queries import get_properties_obj, get_latest_fill_id
from humaniki_schema.utils import Properties, make_fill_dt
from humaniki_schema.log import get_logger
# the top of this snippet is truncated; the Flask-side imports it relies on are
from flask import Flask, jsonify
from flask_cors import CORS
from flask_sqlalchemy_session import flask_scoped_session
# (session_factory is also used below; its import is part of the truncated section)

log = get_logger(BASE_DIR=__file__)

app = Flask(__name__)
CORS(app)
session = flask_scoped_session(session_factory, app)

# Note this requires updating or the process restarting after a new fill.
# TODO: have this be a function that can be called and updated.
latest_fill_id, latest_fill_date = get_latest_fill_id(session)
app.latest_fill_id = latest_fill_id


@app.route("/")
def home():
    log.info('home route called')
    return jsonify(latest_fill_id, latest_fill_date)


@app.route("/v1/available_snapshots/")
def available_snapshots():
    all_snapshot_dates = get_all_snapshot_dates(session)
    return jsonify(all_snapshot_dates)
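A quick way to exercise these routes without running a server is Flask's built-in test client; this sketch assumes the module imports cleanly and the database behind session_factory is reachable:

client = app.test_client()
print(client.get('/').get_json())                         # latest fill id and date
print(client.get('/v1/available_snapshots/').get_json())  # all snapshot dates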