Exemplo n.º 1
0
def get_values(
    taxonomy: str,
    tag: str,
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns all categories optionally filtered by taxonomy.

    **GET Parameters:**
      * limit ....... how many entries should we return?
      * offset....... starting at {offset}
      * taxonomy_id.. ID of the taxonomy to be looked up

    **Output (JSON List):**
      * category_name (string)
    """
    params = {
        'tag': tag,
        'taxonomy': taxonomy,
        'limit': limit,
        'offset': offset
    }

    SQL = """
      SELECT DISTINCT
        taxonomy_tag_val.id AS value_id, taxonomy_tag_val.value AS value
      FROM taxonomy_tag_val
      JOIN tags USING (tag_id)
      JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      WHERE (taxonomy.name = %(taxonomy)s) AND (tags.tag_name = %(tag)s)
      LIMIT %(limit)s
      OFFSET %(offset)s
    """
    rows = execute_db(SQL, params, dict_=True)
    return rows
Exemplo n.º 2
0
    def test_execute_db_correct_stmt_on_closed_connection(self):
        config, _ = parse_test_db_config()
        connect_db(config)
        tag2domain_api.app.util.db._db_conn.close()

        rows = execute_db("SELECT 1;")

        assert len(rows) == 1
        assert rows[0][0] == 1
        disconnect_db()
Exemplo n.º 3
0
    def test_execute_db_correct_stmt(self):
        config, _ = parse_test_db_config()
        connection_args = Psycopg2Adapter.to_psycopg_args(config)
        conn = psycopg2.connect(**connection_args)
        set_db(conn)

        rows = execute_db("SELECT 1;")

        assert len(rows) == 1
        assert rows[0][0] == 1
        disconnect_db()
Exemplo n.º 4
0
def get_types():
    """ Returns filter categories defined in the filter table.

    **Output (JSON list):**
        str - name of filter
    """
    rows = execute_db("""
        SELECT DISTINCT ON (tag_name)
            tag_name
        FROM v_tag2domain_domain_filter
        ORDER BY tag_name
    """, ())
    return [name for name, in rows]
Exemplo n.º 5
0
def get_tags_by_domain(domain: str,
                       at_time: datetime.datetime = None,
                       limit: int = config['default_limit'],
                       offset: int = config['default_offset']):
    """ Returns all taxonomies and tags of a given {domain}

    **GET Parameters:**
      * domain ... the domain name to query (required)
      * at_time .. reference time to look at. If empty, open tags are returned.
            (YYYY-MM-DDTHH:mm:ss)
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * domain name... string
      * domain_id ... ID of the domain
      * tag_id... int
      * tag_name ... name of the tag
      * taxonomy_id ... int
      * taxonomy_name ... name of the linked taxonomy
    """
    parameters = {
        "domain": domain,
        "limit": limit,
        "offset": offset,
        "at_time": at_time
    }
    base_table, base_table_params = get_sql_base_table(at_time, domain=domain)
    parameters.update(base_table_params)

    SQL = """
            SELECT
              taxonomy_id,
              taxonomy.name AS taxonomy_name,
              tag_table.tag_id,
              tag_name,
              value_id,
              value,
              start_time,
              measured_at,
            end_time
            FROM %s AS tag_table -- base_table
            JOIN tags USING (tag_id)
            JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
            LEFT JOIN taxonomy_tag_val ON (tag_table.value_id = taxonomy_tag_val.id)
            ORDER BY domain_id, tag_table.tag_id asc
            LIMIT %%(limit)s OFFSET %%(offset)s""" % (base_table)
    rows = execute_db(SQL, parameters, dict_=True)
    return rows
Exemplo n.º 6
0
def get_stats_taxonomies(
    at_time: datetime.datetime = None,
    filter: str = Query(None, regex=RE_FILTER),
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns all taxonomies with the number of domains that
    are associated with at least one tag from this taxonomy.

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
      | parameter                   | description                                                                     |
      | --------------------------- | ------------------------------------------------------------------------------- |
      | at_time                     | point in time to query                                                          |
      | filter                      | filter domains by entries in the filter table before counting                   |
      | limit                       | limit answer to {limit} entries                                                 |
      | offset                      | start answer at the {offset}-th entry                                           |

    **Output (JSON):**
    JSON list of objects, ordered by count, descending. Each object has the following keys:
      | key                         | description                                                                     |
      | --------------------------- | ------------------------------------------------------------------------------- |
      | taxonomy_name               | name of the taxonomy                                                            |
      | count                       | number of domains labeled by at least one tag from this taxonomy                |
    """
    base_table, base_table_params = get_sql_base_table(at_time, filter)

    # Build the SQL statement
    SQL = """
      SELECT
        COUNT(DISTINCT domain_id), taxonomy.name AS taxonomy_name
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
        GROUP by taxonomy.id, taxonomy.name
        ORDER BY count DESC LIMIT %%(limit)s OFFSET %%(offset)s
      """ % (base_table)

    params = {
        'at_time': at_time,
        'limit': limit,
        'offset': offset
    }
    params.update(base_table_params)
    rows = execute_db(SQL, params, dict_=True)
    return rows
Exemplo n.º 7
0
def get_categories(
    taxonomy: Optional[str] = None,
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns all categories optionally filtered by taxonomy.

    **GET Parameters:**
      * limit ....... how many entries should we return?
      * offset....... starting at {offset}
      * taxonomy_id.. ID of the taxonomy to be looked up

    **Output (JSON List):**
      * category_name (string)
    """
    params = {
        'limit': limit,
        'offset': offset,
        'taxonomy': taxonomy
    }
    if taxonomy is None:
        taxonomy_clause = "TRUE"
    else:
        taxonomy_clause = "(taxonomy.name = %(taxonomy)s)"

    SQL = """
      SELECT DISTINCT ON (category)
        (REGEXP_REPLACE(tag_name, '^(.+)::.+$', '\\1')) AS category
      FROM tags
      JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      WHERE STRPOS(tag_name, '::') != 0 AND {0}
      UNION ALL
      SELECT
        category
      FROM (
        SELECT
          '' AS category
          FROM tags
          JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
          WHERE
            STRPOS(tag_name, '::') = 0 AND {0}
          LIMIT 1
      ) AS t
      LIMIT %(limit)s
      OFFSET %(offset)s
    """.format(taxonomy_clause)
    rows = execute_db(SQL, params, dict_=False)
    return [_elem[0] for _elem in rows]
Exemplo n.º 8
0
    def test_reconnect_db(self):
        config, _ = parse_test_db_config()
        old_db_conn = connect_db(config)
        self.assertIsInstance(old_db_conn, psycopg2.extensions.connection)
        db_conn = connect_db(config)
        self.assertIsInstance(db_conn, psycopg2.extensions.connection)

        self.assertRaisesRegex(
            psycopg2.Error,
            "connection already closed",
            old_db_conn.cursor
        )

        rows = execute_db("SELECT 1;")

        assert len(rows) == 1
        assert rows[0][0] == 1
        disconnect_db()
Exemplo n.º 9
0
def get_values(filter: str):
    """ Returns filter values found in the filter table.

    **Output (JSON list):**
        str - value
    """
    rows = execute_db(
        """
        SELECT DISTINCT ON (value)
            value
        FROM v_tag2domain_domain_filter
        WHERE (tag_name = %s) AND (value IS NOT NULL)
        ORDER BY value
        """,
        (filter, )
    )

    return [name for name, in rows]
Exemplo n.º 10
0
def get_tag_history_by_domain(domain: str,
                              limit: int = config['default_limit'],
                              offset: int = config['default_offset']):
    """ Returns the tag history of a single domain.

    **GET Parameters:**
      * domain ... the domain name to query (required)
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * domain name... string
      * domain_id ... ID of the domain
      * tag_id... int
      * tag_name ... name of the tag
      * value_id ... ID of the value associated with the tag
      * value ... value associated with the tag
      * taxonomy_id ... int
      * taxonomy_name ... name of the linked taxonomy
    """
    parameters = {"domain": domain, "limit": limit, "offset": offset}

    SQL = """
      SELECT
         taxonomy_id,
         taxonomy.name AS taxonomy_name,
         tag_table.tag_id,
         tag_name,
         value_id,
         value,
         start_time,
         measured_at,
         end_time
      FROM tag2domain_get_all_tags_domain(%(domain)s) AS tag_table
      JOIN tags USING (tag_id)
      JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      LEFT JOIN taxonomy_tag_val ON (tag_table.value_id = taxonomy_tag_val.id)
      ORDER BY domain_id, tag_id ASC
      LIMIT %(limit)s OFFSET %(offset)s
    """
    rows = execute_db(SQL, parameters, dict_=True)
    return rows
Exemplo n.º 11
0
def get_taxonomies(
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns all known taxonomies.

    **GET Parameters:**
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * id... integer
      * name ... name of the tag
      * description ... long description
      * is_actionable ... value between 0 and 1 on how actionable this is on
            data (can we automatically do an action based on the tag?)
      * is_automatically_classifiable... bool. Can we automatically tag a
            domain based on this taxonomy?
      * is_stable... bool. Does this taxonomy change a lot?
      * allows_auto_tags... bool. If a new tag appears, may it be automatically
            added to the taxonomy?
      * allows_auto_values... bool. If a new tag appears, may it be
            automatically added to the taxonomy?
      * for_numbers... bool. Is this taxonomy meant for numbers (IP addresses)?
      * for_domains... bool. Same but is it meant for domains?
      * url ... string. URL to further documentation.
    """
    SQL = """SELECT
                id,
                name,
                description,
                is_actionable,
                is_automatically_classifiable,
                is_stable,allows_auto_tags,
                for_numbers,
                for_domains,
                url
             FROM taxonomy ORDER BY id asc LIMIT %s OFFSET %s"""
    rows = execute_db(SQL, (limit, offset), dict_=True)
    return rows
Exemplo n.º 12
0
def get_stats_values(
    taxonomy: str,
    tag: str,
    at_time: datetime.datetime = None,
    filter: str = Query(None, regex=RE_FILTER),
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns stats on the values associated with a tag in a taxonomy.

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
      | parameter                   | description                                                                     |
      | --------------------------- | ------------------------------------------------------------------------------- |
      | taxonomy                    | name of taxonomy the tag to be looked up belongs to                             |
      | tag                         | name of the tag to be looked up                                                 |
      | at_time                     | point in time to query                                                          |
      | filter                      | filter domains by entries in the filter table before counting                   |
      | limit                       | limit answer to {limit} entries                                                 |
      | offset                      | start answer at the {offset}-th entry                                           |

    **Output (JSON):**
    JSON list of objects, ordered by count, descending. Each object has the following keys:
      | key                         | description                                                                     |
      | --------------------------- | ------------------------------------------------------------------------------- |
      | value                       | value                                                                           |
      | count                       | number of domains labeled by value and tag {tag}                                |
    """
    base_table, base_table_params = get_sql_base_table(at_time, filter)

    SQL = """
      SELECT
        value,
        COUNT(domain_id) AS count
      FROM (
        SELECT DISTINCT
          domain_id, value
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
        JOIN taxonomy_tag_val ON (tag_table.value_id = taxonomy_tag_val.id)
        WHERE
          (taxonomy.name = %%(taxonomy)s)
          AND (tag_name = %%(tag)s)
      ) AS t
      GROUP BY value
      ORDER BY count DESC
    """ % base_table
    params = {
        'taxonomy': taxonomy,
        'tag': tag,
        'at_time': at_time,
        'limit': limit,
        'offset': offset
    }
    params.update(base_table_params)
    rows = execute_db(SQL, params, dict_=True)
    return rows
Exemplo n.º 13
0
async def selftest():
    try:
        execute_db("SELECT 1", ())
    except Exception:
        raise HTTPException(status_code=503, detail="failed DB execute")
    return {"message": "OK"}
Exemplo n.º 14
0
def get_domains_by_tag(tag: str,
                       at_time: datetime.datetime = None,
                       filter_by_value: Optional[bool] = False,
                       value: Optional[str] = None,
                       filter: str = Query(None, regex=RE_FILTER),
                       limit: int = config['default_limit'],
                       offset: int = config['default_offset']):
    """ Returns all domains of a given {tag}

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
      * tag ... the tag name to query (required)
      * at_time .. reference time to look at. If empty, open tags are returned.
            (YYYY-MM-DDTHH:mm:ss)
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * domain name... string
      * domain_id ... ID of the domain
      * tag_id... int
      * tag_name ... name of the tag
      :type tag: str
    """
    base_table, base_table_params = get_sql_base_table(at_time, filter)
    parameters = {
        "tag_name": tag,
        "limit": limit,
        "offset": offset,
        "at_time": at_time,
        "value": value
    }
    parameters.update(base_table_params)

    whereclause_list = [
        "(tag_name = %(tag_name)s)",
    ]
    if filter_by_value:
        if value is None:
            whereclause_list.append("(value IS NULL)")
        elif isinstance(value, str):
            whereclause_list.append("(value = %(value)s)")
        else:
            raise ValueError("expected value to be str, got '%s'" %
                             str(type(value)))

    whereclause = ' AND '.join(whereclause_list)

    SQL = ("""
        SELECT
            domain_id,
            domain_name,
            tag_type,
            value,
            start_time,
            measured_at,
            end_time
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        LEFT JOIN taxonomy_tag_val ON (tag_table.value_id = taxonomy_tag_val.id)
        WHERE (%s) -- whereclause
        ORDER BY domain_id, tag_table.tag_id ASC
        LIMIT %%(limit)s OFFSET %%(offset)s""" % (base_table, whereclause))
    rows = execute_db(SQL, parameters, dict_=True)
    return rows
Exemplo n.º 15
0
def get_domains_by_version(
        taxonomy: str,
        tag: str,
        version: Optional[str] = Query(
            None,
            regex="^(?:[0-9]+)(?:\.[0-9]+)*$"  # noqa: W605
        ),
        operator: VersionComparisonOperatorParameter = "=",
        at_time: datetime.datetime = None,
        filter: str = Query(None, regex=RE_FILTER),
        limit: int = config['default_limit'],
        offset: int = config['default_offset']):
    """ Return domains by the version of the tag {tag} in taxonomy {taxonomy}.

    Only tags with a value that looks like a version number (e.g. 1.2.3.4) will
    be returned. Versioning schemes with letters are not considered.

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
      | parameter         | description                                                                      |
      | ----------------- | -------------------------------------------------------------------------------- |
      | taxonomy          | taxonomy to query from                                                           |
      | tag               | tag to query from                                                                |
      | version           | version to compare to. Must be in format 1.2.3.4 ... or null                     |
      | operator          | operator used to compare version strings. One of =, <, <=, >, >=                 |
      | at_time           | point in time to query                                                           |
      | filter            | filter domains by entries in the filter table before counting                    |
      | limit             | limit answer to {limit} entries                                                  |
      | offset            | start answer at the {offset}-th entry                                            |

    **Output (JSON):**
    JSON list of objects, ordered by count, descending. Each object has the following keys:
      | key             | description                                                                     |
      | ----------------| ------------------------------------------------------------------------------- |
      | domain_id       | ID of the found domain                                                          |
      | domain_name     | name of the found domain                                                        |
      | version         | version found in the value field                                                |
      | start_time      | start time of the tag                                                           |
      | measured_at     | time of last measurement of the tag                                             |
      | end_time        | end time of the tag                                                             |

    """
    if version is None and operator != VersionComparisonOperatorParameter.equal:
        raise HTTPException(
            status_code=400,
            detail="null tags can only be searched with operator '='")

    parameters = {
        "taxonomy": taxonomy,
        "tag_name": tag,
        "version": version,
        "limit": limit,
        "offset": offset,
        "at_time": at_time
    }
    base_table, base_table_params = get_sql_base_table(at_time, filter)
    parameters.update(base_table_params)

    if version is None:
        value_clause = "(value IS NULL)"
    else:
        if operator == VersionComparisonOperatorParameter.equal:
            op = "="
        elif operator == VersionComparisonOperatorParameter.lessthan:
            op = "<"
        elif operator == VersionComparisonOperatorParameter.lessthanequal:
            op = "<="
        elif operator == VersionComparisonOperatorParameter.greaterthan:
            op = ">"
        elif operator == VersionComparisonOperatorParameter.greaterthanequal:
            op = ">="
        else:
            raise HTTPException(status_code=400, detail="invalid operator")

        if op == "=":
            value_clause = "(value = %(version)s)"
        else:
            value_clause = (
                "(value ~ '^(?:[0-9]+)(?:\.[0-9]+)*$')"  # noqa: W605
                " AND (string_to_array(value, '.')::bigint[] %s "
                "string_to_array(%%(version)s, '.')::bigint[])" % op)

    SQL = """
      SELECT
          domain_id,
          domain_name,
          start_time,
          measured_at,
          end_time,
          value AS version
      FROM %s AS tag_table -- base_table
      JOIN tags USING (tag_id)
      JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      LEFT JOIN taxonomy_tag_val ON (tag_table.value_id = taxonomy_tag_val.id)
      WHERE
          (taxonomy.name = %%(taxonomy)s)
          AND (tag_name = %%(tag_name)s)
          AND (%s) -- value_clause
      ORDER BY domain_id, tag_table.tag_id ASC
      LIMIT %%(limit)s
      OFFSET %%(offset)s
    """ % (base_table, value_clause)
    rows = execute_db(SQL, parameters, dict_=True)
    return rows
Exemplo n.º 16
0
def get_domains_by_category(taxonomy: str,
                            category: Optional[str] = '',
                            at_time: datetime.datetime = None,
                            filter: str = Query(None, regex=RE_FILTER),
                            limit: int = config['default_limit'],
                            offset: int = config['default_offset']):
    """ Returns all domains of a given {taxonomy}

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
      * taxonomy ... the taxonomy name to query (required)
      * at_time .. reference time to look at. If empty, open tags are returned.
            (YYYY-MM-DDTHH:mm:ss)
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * domain name... string
      * domain_id ... ID of the domain
      * taxonomy_id... int
      * taxonomy_name ... name of the taxonomy
      :type taxonomy: str
    """
    parameters = {
        "taxonomy_name": taxonomy,
        "category": category,
        "limit": limit,
        "offset": offset,
        "at_time": at_time
    }
    base_table, base_table_params = get_sql_base_table(at_time, filter)
    parameters.update(base_table_params)

    SQL = """SELECT
                domain_id,
                domain_name,
                tag_id,
                tag_name,
                start_time,
                measured_at,
                end_time
             FROM %s AS tag_table -- base_table
             JOIN tags USING (tag_id)
             JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
             WHERE
              (taxonomy.name = %%(taxonomy_name)s)
              AND
              (
                ( -- tags with a category
                  (STRPOS(tag_name, '::') != 0)
                  AND (
                  (REGEXP_REPLACE(tag_name, '^(.+)::.+$', '\\1')) = %%(category)s)
                )
                OR
                ( -- tags without a category
                  (STRPOS(tag_name, '::') = 0)
                  AND (%%(category)s = '')
                )
              )
             ORDER BY domain_id, tag_id asc
             LIMIT %%(limit)s OFFSET %%(offset)s""" % base_table

    rows = execute_db(SQL, parameters, dict_=True)

    start = time.time()
    ret = []
    cur_domain = None
    for row in rows:
        if cur_domain is None or row["domain_id"] != cur_domain["domain_id"]:
            if cur_domain is not None:
                ret.append(cur_domain)
            cur_domain = {
                "domain_id": row["domain_id"],
                "domain_name": row["domain_name"],
                "tags": []
            }
        cur_domain["tags"].append({
            "tag_id": row["tag_id"],
            "tag_name": row["tag_name"],
            "start_time": row["start_time"],
            "measured_at": row["measured_at"],
            "end_time": row["end_time"]
        })
    if cur_domain is not None:
        ret.append(cur_domain)
    logger.debug("reshuffled results in %f s", time.time() - start)

    logger.debug("preparing response...")
    start = time.time()
    response = ORJSONResponse(content=ret)
    logger.debug("response prepared in %f s", time.time() - start)

    return response
Exemplo n.º 17
0
def get_stats_categories(
    taxonomy: str,
    at_time: datetime.datetime = None,
    filter: str = Query(None, regex=RE_FILTER),
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """Returns the categories within a taxonomy with the number of
    domains that are associated with at least one tag from the
    category.

    A filter is a string of the form Tag=Value. The available tags and values can be fetched using
    the /api/v1/filters/types and /api/v1/filters/values endpoints, respectively.

    **GET Parameters:**
    | parameter                   | description                                                                     |
    | --------------------------- | ------------------------------------------------------------------------------- |
    | taxonomy                    | name of taxonomy to be looked up                                                |
    | at_time                     | point in time to query                                                          |
    | filter                      | filter domains by entries in the filter table before counting                   |
    | limit                       | limit answer to {limit} entries                                                 |
    | offset                      | start answer at the {offset}-th entry                                           |

    **Output (JSON):**
    JSON list of objects, ordered by count, descending. Each object has the following keys:
    | key                         | description                                                                     |
    | --------------------------- | ------------------------------------------------------------------------------- |
    | category                    | name of the category                                                            |
    | count                       | number of domains  labeled by at least one tag from this category               |
    """
    base_table, base_table_params = get_sql_base_table(at_time, filter)

    SQL = """
      WITH t AS (
        SELECT
          domain_id,
          REGEXP_REPLACE(tag_name, '^(.+)::.+$', '\\1') AS category
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
        WHERE
          (taxonomy.name = %%(taxonomy)s)
          AND (STRPOS(tag_name, '::') != 0)
        UNION ALL
        SELECT
          domain_id,
          '' AS category
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
        WHERE
          (taxonomy.name = %%(taxonomy)s)
          AND (STRPOS(tag_name, '::') = 0)
      )
      SELECT
        category,
        COUNT(*) AS count
      FROM (
        SELECT DISTINCT ON (domain_id, category) category FROM t
      ) AS subquery
      GROUP BY category
      ORDER BY count DESC
      LIMIT %%(limit)s
        OFFSET %%(offset)s""" % (base_table, base_table)
    params = {
        'taxonomy': taxonomy,
        'at_time': at_time,
        'limit': limit,
        'offset': offset
    }
    params.update(base_table_params)
    rows = execute_db(SQL, params, dict_=True)
    return rows
Exemplo n.º 18
0
def get_tag_info(
    taxonomy: str,
    tag: str,
):
    """ Returns information about a single tag.

    **GET Parameters:**
    + taxonomy - name of the taxonomy the tag is in
    + tag - name of the tag to be fetched

    **Output (JSON Object):**
    + tag
      + name
      + description
      + category
      + extras
    + taxonomy
      + name
      + description
      + url
      + flags
        + is_actionable
        + is_automatically_classifiable
        + is_stable
        + for_numbers
        + for_domains
        + allows_auto_tags
        + allows_auto_values
    + values
      + count - number of distinct values that are associated with the tag

    See the [tag2domain docs](https://docu.labs.nic.at/doku.php?id=rd:topics:tag2domain:konzept)
    for further information about the fields.

    """
    params = {
        'tag': tag,
        'taxonomy': taxonomy
    }

    SQL = """
      SELECT
        tags.tag_name AS tag_name,
        tags.tag_description AS tag_description,
        REGEXP_REPLACE(tags.tag_name, '^(.+)::.+$', '\\1') AS tag_category,
        tags.extras AS tag_extras,
        taxonomy.name AS taxonomy_name,
        taxonomy.description AS taxonomy_description,
        taxonomy.url AS taxonomy_url,
        taxonomy.is_actionable AS taxonomy_flags_is_actionable,
        taxonomy.is_automatically_classifiable AS taxonomy_flags_is_automatically_classifiable,
        taxonomy.is_stable AS taxonomy_flags_is_stable,
        taxonomy.for_numbers AS taxonomy_flags_for_numbers,
        taxonomy.for_domains AS taxonomy_flags_for_domains,
        taxonomy.allows_auto_tags AS taxonomy_flags_allows_auto_tags,
        taxonomy.allows_auto_values AS taxonomy_flags_allows_auto_values,
        COUNT(DISTINCT taxonomy_tag_val.id) AS values_count
      FROM tags
      JOIN taxonomy ON (taxonomy.id = tags.taxonomy_id)
      LEFT JOIN taxonomy_tag_val ON (taxonomy_tag_val.tag_id = tags.tag_id)
      WHERE
        tags.tag_name = %(tag)s
        AND taxonomy.name = %(taxonomy)s
        AND (STRPOS(tag_name, '::') != 0)
      GROUP BY
      tags.tag_name, tags.tag_description, tags.extras,
      taxonomy.name, taxonomy.description, taxonomy.description,
      taxonomy.url, taxonomy.is_actionable, taxonomy.is_automatically_classifiable,
      taxonomy.is_stable, taxonomy.for_numbers, taxonomy.for_domains,
      taxonomy.allows_auto_tags, taxonomy.allows_auto_values
      UNION ALL
      SELECT
        tags.tag_name AS tag_name,
        tags.tag_description AS tag_description,
        NULL AS tag_category,
        tags.extras AS tag_extras,
        taxonomy.name AS taxonomy_name,
        taxonomy.description AS taxonomy_description,
        taxonomy.url AS taxonomy_url,
        taxonomy.is_actionable AS taxonomy_flags_is_actionable,
        taxonomy.is_automatically_classifiable AS taxonomy_flags_is_automatically_classifiable,
        taxonomy.is_stable AS taxonomy_flags_is_stable,
        taxonomy.for_numbers AS taxonomy_flags_for_numbers,
        taxonomy.for_domains AS taxonomy_flags_for_domains,
        taxonomy.allows_auto_tags AS taxonomy_flags_allows_auto_tags,
        taxonomy.allows_auto_values AS taxonomy_flags_allows_auto_values,
        COUNT(DISTINCT taxonomy_tag_val.id) AS values_count
      FROM tags
      JOIN taxonomy ON (taxonomy.id = tags.taxonomy_id)
      LEFT JOIN taxonomy_tag_val ON (taxonomy_tag_val.tag_id = tags.tag_id)
      WHERE
        tags.tag_name = %(tag)s
        AND taxonomy.name = %(taxonomy)s
        AND (STRPOS(tag_name, '::') = 0)
      GROUP BY
      tags.tag_name, tags.tag_description, tags.extras,
      taxonomy.name, taxonomy.description, taxonomy.description,
      taxonomy.url, taxonomy.is_actionable, taxonomy.is_automatically_classifiable,
      taxonomy.is_stable, taxonomy.for_numbers, taxonomy.for_domains,
      taxonomy.allows_auto_tags, taxonomy.allows_auto_values
    ;
    """
    rows = execute_db(SQL, params, dict_=True)

    if len(rows) == 0:
        raise HTTPException(
            status_code=404,
            detail="no tag '%s' in taxonomy '%s' found" % (tag, taxonomy)
        )
    elif len(rows) > 1:
        raise HTTPException(status_code=500, detail="more than 1 tag found")

    row, = rows

    ret = {
        'tag': {
            'name': row['tag_name'],
            'description': row['tag_description'],
            'category': row['tag_category'],
            'extras': row['tag_extras'],
        },
        'taxonomy': {
            'name': row['taxonomy_name'],
            'description': row['taxonomy_description'],
            'url': row['taxonomy_url'],
            'flags': {
                'is_actionable': row['taxonomy_flags_is_actionable'],
                'is_automatically_classifiable': row['taxonomy_flags_is_automatically_classifiable'],
                'is_stable': row['taxonomy_flags_is_actionable'],
                'for_numbers': row['taxonomy_flags_for_numbers'],
                'for_domains': row['taxonomy_flags_for_domains'],
                'allows_auto_tags': row['taxonomy_flags_allows_auto_tags'],
                'allows_auto_values': row['taxonomy_flags_allows_auto_values']
            }
        },
        'values': {
            'count': row["values_count"]
        }
    }

    return ret
Exemplo n.º 19
0
def get_stats_bycategory(
    taxonomy: str,
    category: Optional[str] = None,
    at_time: datetime.datetime = None,
    filter: str = Query(None, regex=RE_FILTER),
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns stats on all tags that are within a taxonomy, optionally
    filtered by categories.

    A filter is a string of the form Tag=Value. The available tags and values
    can be fetched using the /api/v1/filters/types and /api/v1/filters/values
    endpoints, respectively.

    **GET Parameters:**
    | parameter          | description                                                                                                          |
    | ------------------ | -------------------------------------------------------------------------------------------------------------------- |
    | taxonomy           | taxonomy to calculate stats for                                                                                      |
    | category           | filter for this category. If not given all tags in taxonomy are returned. Set empty string for tags in root category |
    | at_time            | point in time to query                                                                                               |
    | filter             | filter domains by entries in the filter table before counting                                                        |
    | limit              | limit answer to {limit} entries                                                                                      |
    | offset             | start answer at the {offset}-th entry                                                                                |

    **Output (JSON):**
    JSON list of objects, ordered by count, descending. Each object has the following keys:
    | key                | description                                                                     |
    | ------------------ | ------------------------------------------------------------------------------- |
    | tag_name           | name of the tag                                                                 |
    | count              | number of domains labeled by this tag in taxonomy {taxonomy}                    |
    """
    base_table, base_table_params = get_sql_base_table(at_time, filter)

    if category is not None:
        category_clause = """
          (
            ( -- tags with a category
              (STRPOS(tag_name, '::') != 0)
              AND (
                (REGEXP_REPLACE(tag_name, '^(.+)::.+$', '\\1')) = %(category)s
              )
            )
            OR
            ( -- tags without a category
              (STRPOS(tag_name, '::') = 0)
              AND (%(category)s = '')
            )
          )
        """
    else:
        category_clause = "TRUE"

    SQL = """
      SELECT
        tag_name,
          COUNT(DISTINCT domain_id) AS count
        FROM %s AS tag_table -- base_table
        JOIN tags USING (tag_id)
        JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      WHERE
        (taxonomy.name = %%(taxonomy)s)
        AND (%s) -- category clause
      GROUP BY tag_id, tag_name
      ORDER BY count DESC
      """ % (base_table, category_clause)
    params = {
        'taxonomy': taxonomy,
        'category': category,
        'at_time': at_time,
        'limit': limit,
        'offset': offset
    }
    params.update(base_table_params)
    rows = execute_db(SQL, params, dict_=True)
    return rows
Exemplo n.º 20
0
def get_tags(
    taxonomy: str = None,
    category: str = None,
    limit: int = config['default_limit'],
    offset: int = config['default_offset']
):
    """ Returns all known tags.

    **GET Parameters:**
      * limit .... how many entries should we return?
      * offset.... starting at {offset}

    **Output (JSON):**
      * tag_id... integer
      * tag_name ... name of the tag
      * tag_description ... long description
      * taxonomy_id ... id of the linked taxonomy
      * extras... JSON dict of potential extra information
    """
    params = {
        'taxonomy': taxonomy,
        'category': category,
        'limit': limit,
        'offset': offset
    }

    if taxonomy is not None:
        taxonomy_where_clause = "(taxonomy.name = %(taxonomy)s)"
    else:
        taxonomy_where_clause = "True"

    if category is not None:
        if taxonomy is None:
            raise HTTPException(
                status_code=400,
                detail="querying category tags requires taxonomy to be set"
            )

        if category == "":
            category_where_clause = "(STRPOS(tag_name, '::') = 0)"
        else:
            category_where_clause = (
                "(STRPOS(tag_name, '::') != 0) "
                "AND ("
                "(REGEXP_REPLACE(tag_name,'^(.+)::.+$', '\\1')) = %(category)s"
                ")"
            )
    else:
        category_where_clause = "True"

    SQL = """
      SELECT tag_id,tag_name,tag_description,taxonomy_id,extras
      FROM tags
      JOIN taxonomy ON (tags.taxonomy_id = taxonomy.id)
      WHERE ( %s ) AND ( %s ) -- taxonomy_where_clause, category_where_clause
      ORDER BY tag_id ASC
      LIMIT %%(limit)s
      OFFSET %%(offset)s
    """ % (taxonomy_where_clause, category_where_clause)
    rows = execute_db(SQL, params, dict_=True)
    return rows