Example #1
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id,
                                 prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
Example #2
def _insert_similarity_into_db(db, log, run_id, merged_filename):
    try:
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
    except psycopg2.IntegrityError:
        log.info("Error saving similarity score filename to database. "
                 "The project may have been deleted.")
        raise RunDeleted(run_id)
    log.debug(f"Saved path to similarity scores file to db with id "
              f"{result_id}")
Example #3
def insert_permutation(conn, dp_id, run_id, perm_list):
    sql_insertion_query = """
        INSERT INTO permutations
          (dp, run, permutation)
        VALUES
          (%s, %s, %s)
        """
    try:
        with conn.cursor() as cur:
            cur.execute(sql_insertion_query, [dp_id, run_id, psycopg2.extras.Json(perm_list)])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)
Example #4
def progress_run_stage(db, run_id):
    try:
        with db.cursor() as cur:
            sql_query = """
                UPDATE runs SET
                  stage = stage + 1
                WHERE
                  run_id = %s
                """
            cur.execute(sql_query, [run_id])
    except psycopg2.Error as e:
        logger.warning(e)
        raise RunDeleted(run_id)
Example #5
def insert_permutation_mask(conn, project_id, run_id, mask_list):
    sql_insertion_query = """
        INSERT INTO permutation_masks
          (project, run, raw)
        VALUES
          (%s, %s, %s)
        """
    json_mask = psycopg2.extras.Json(mask_list)
    try:
        with conn.cursor() as cur:
            cur.execute(sql_insertion_query, [project_id, run_id, json_mask])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)
Example #6
def insert_mapping_result(db, run_id, mapping):
    try:
        with db.cursor() as cur:
            insertion_query = """
                INSERT into run_results
                  (run, result)
                VALUES
                  (%s, %s)
                RETURNING id;
                """
            result_id = execute_returning_id(cur, insertion_query, [run_id, psycopg2.extras.Json(mapping)])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)
    return result_id
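Examples #6 and #7 call an execute_returning_id helper that is not shown. A minimal sketch, assuming it simply executes the INSERT ... RETURNING id statement and hands back the generated id:

# Hypothetical sketch of the execute_returning_id helper: executes the query
# and returns the first column of the single row produced by RETURNING.
def execute_returning_id(cur, query, args):
    cur.execute(query, args)
    query_result = cur.fetchone()
    if query_result is None:
        return None
    return query_result[0]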
Example #7
def insert_similarity_score_file(db, run_id, filename):
    with db.cursor() as cur:
        insertion_query = """
            INSERT into similarity_scores
              (run, file)
            VALUES
              (%s, %s)
            RETURNING id;
            """
        try:
            result_id = execute_returning_id(cur, insertion_query, [run_id, filename])
        except psycopg2.IntegrityError:
            raise RunDeleted(run_id)
    return result_id
Example #8
def get_run_result(db, resource_id):
    """
    Return a Python dictionary mapping the index in A to
    the index in B.

    Note the response is a mapping of str -> int, as both celery and
    postgres prefer keys to be strings.
    """
    sql_query = """
        SELECT result from run_results
        WHERE run = %s
        """
    query_result = query_db(db, sql_query, [resource_id], one=True)
    if query_result is None:
        raise RunDeleted(f"Run {resource_id} not found in database")
    return query_result['result']
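A hypothetical usage sketch for get_run_result, illustrating the str -> int mapping described in the docstring. run_id and index_in_a are placeholders, and DBConn is the same connection helper used in the other examples:

# Illustrative only: the result is round-tripped through JSON, so keys come
# back as strings, e.g. {"0": 3, "1": 0, "2": 2}.
with DBConn() as db:
    mapping = get_run_result(db, run_id)
index_in_b = mapping[str(index_in_a)]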
Example #9
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # Be very defensive here: check whether the run state is already in the redis cache.
    if not is_run_missing(run_id):
        log.warning(
            "Unexpectedly, the run state is already present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)

        log.debug("Updating redis cache for run")
        set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")
Example #10
def store_similarity_scores(buffer, run_id, length, conn):
    """
    Stores the similarity scores above a similarity threshold as a CSV in minio.

    :param buffer: The file stream to store.
        Expected to be a line per link (e.g. a candidate match) containing 3 fields separated by a comma:
            - the index of an entity from dataprovider 1
            - the index of an entity from dataprovider 2
            - the similarity score between 0 and 1 of the best match
    :param run_id: The run to which the similarity scores belong.
    :param length: Size of the buffer in bytes.
    :param conn: database connection to reuse.
    """
    log = logger.bind(run_id=run_id)
    filename = config.SIMILARITY_SCORES_FILENAME_FMT.format(run_id)

    log.info(
        "Storing similarity score results in CSV file: {}".format(filename))
    mc = connect_to_object_store()
    mc.put_object(config.MINIO_BUCKET,
                  filename,
                  data=buffer,
                  length=length,
                  content_type='application/csv')

    log.debug("Storing the CSV filename '{}' in the database".format(filename))
    try:
        result_id = insert_similarity_score_file(conn, run_id, filename)
    except psycopg2.IntegrityError:
        log.info(
            "Error saving similarity score filename to database. Suspect that project has been deleted"
        )
        raise RunDeleted(run_id)
    log.debug("Saved path to similarity scores file to db with id {}".format(
        result_id))
    return filename
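An illustrative call to store_similarity_scores, building a small in-memory buffer in the three-field CSV format the docstring describes. run_id is a placeholder, and DBConn is used as in the other examples:

# Illustrative only: one line per candidate match, holding the index in
# dataprovider 1, the index in dataprovider 2, and the similarity score.
import io

csv_bytes = b"0,5,0.92\n1,3,0.87\n2,2,0.75\n"
buffer = io.BytesIO(csv_bytes)

with DBConn() as conn:
    filename = store_similarity_scores(buffer, run_id, len(csv_bytes), conn)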