예제 #1
0
def deduplication(columns=None, df=None):
    """
    Build the deduplication metric and either run it immediately or return it
    wrapped in a Task.

    If a df is passed, the metric is run on it and the scores of the (single)
    metric are returned; otherwise the Task instance containing this metric is
    returned, to be run later (possibly after adding other tasks/metrics to it).

    :param columns: Columns on which to run the metric, None to run the deduplication
        metric on the whole table. When None, no "columns" entry is put in the
        metric parameters at all.
    :type columns: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # Parameters describing the metric; "columns" is optional and only
    # included when the caller actually supplied it.
    params = {"metric": "deduplication"}
    if columns is not None:
        params["columns"] = columns
    t = task.Task([params])
    if df is None:
        return t
    # The task holds exactly one metric, so its result is the first entry.
    return t.run(df)[0]["scores"]
예제 #2
0
def grouprule(columns, having, conditions=None, df=None):
    """
    Build the groupRule metric and either run it immediately or return it
    wrapped in a Task.

    With a df, the metric is executed and the scores of the (single) metric
    are returned; without one, the Task instance holding the metric is handed
    back so it can be run later (possibly after adding other tasks/metrics).

    :param columns: Columns on which to run the metric, grouping data.
    :type columns: list
    :param having: Conditions to apply to groups.
    :type having: list
    :param conditions: Conditions on which to run the metric, filtering data before grouping, can be None.
        When None, no "conditions" entry is put in the metric parameters.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # Mandatory parameters first; the optional filter is added afterwards.
    spec = {
        "metric": "groupRule",
        "columns": columns,
        "having": having,
    }
    if conditions is not None:
        spec["conditions"] = conditions

    metric_task = task.Task([spec])
    # Only one metric in the task, hence index 0 of the run result.
    return metric_task if df is None else metric_task.run(df)[0]["scores"]
예제 #3
0
def constraint(when, then, conditions=None, df=None):
    """
    Build the constraint metric and either run it immediately or return it
    wrapped in a Task.

    With a df, the metric is executed and the scores of the (single) metric
    are returned; without one, the Task instance holding the metric is handed
    back so it can be run later (possibly after adding other tasks/metrics).

    :param when: A list of columns in the df to use as the precondition of a functional constraint. No column
        should be in both when and then.
    :param then: A list of columns in the df to use as the postcondition of a functional constraint. No column
        should be in both when and then.
    :param conditions: Conditions on which to filter data before applying the metric. Any falsy value
        (None, empty list, ...) means no "conditions" entry is put in the metric parameters.
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    spec = {
        "metric": "constraint",
        "when": when,
        "then": then,
    }
    # Truthiness check on purpose: an empty conditions list is treated the
    # same as no conditions at all.
    if conditions:
        spec.update(conditions=conditions)

    metric_task = task.Task([spec])
    if df is not None:
        # Only one metric in the task, hence index 0 of the run result.
        return metric_task.run(df)[0]["scores"]
    return metric_task
예제 #4
0
def mutual_info(when, then, df=None):
    """
    Build the mutual_info metric and either run it immediately or return it
    wrapped in a Task.

    With a df, the metric is executed and the scores of the (single) metric
    are returned; without one, the Task instance holding the metric is handed
    back so it can be run later (possibly after adding other tasks/metrics).

    :param when: First column on which to compute MI.
    :type when: str/int
    :param then: Second column on which to compute MI.
    :type then: str/int
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    # The task is created from a one-element list holding the metric parameters.
    spec = {"metric": "mutual_info", "when": when, "then": then}
    metric_task = task.Task([spec])

    if df is not None:
        # Only one metric in the task, hence index 0 of the run result.
        return metric_task.run(df)[0]["scores"]
    return metric_task
예제 #5
0
def timeliness(columns, value, df=None, dateFormat=None, timeFormat=None):
    """
    Build the timeliness metric and either run it immediately or return it
    wrapped in a Task.

    If a df is passed, the metric is run on it and the scores of the (single)
    metric are returned; otherwise the Task instance containing this metric is
    returned, to be run later (possibly after adding other tasks/metrics to it).
    Use http://strftime.org/ directives to express formats.

    :param columns: Columns on which to run the metric, columns of type string will be casted to timestamp
        using the dateFormat or timeFormat argument.
    :type columns: list
    :param value: Value used to run the metric, confronting values in the specified columns against it.
    :type value: str
    :param dateFormat: Format in which the value (and values in columns, if they are of string type) are; used
        to cast columns if they contain dates as strings. Either dateFormat
        or timeFormat must be passed, but not both. Use http://strftime.org/ directives to express formats.
    :type dateFormat: str
    :param timeFormat: Format in which the value (and values in columns, if they are of string type) are; used
        to cast columns if they contain dates as strings. Either dateFormat
        or timeFormat must be passed, but not both. Use http://strftime.org/ directives to express formats.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    :raises AssertionError: If both or neither of dateFormat and timeFormat are given (i.e. are not None).
    """
    # Exactly one of the two formats must be supplied. This is raised
    # explicitly instead of using an `assert` statement so the check is not
    # stripped when Python runs with -O; AssertionError is kept so existing
    # callers that catch it keep working.
    if (dateFormat is None) == (timeFormat is None):
        raise AssertionError("Pass either a dateFormat or a timeFormat, not both.")

    # make a dict representing the parameters
    params = {"metric": "timeliness", "columns": columns, "value": value}
    # Truthiness check: an empty-string format passes the validation above
    # but is not forwarded to the task.
    if dateFormat:
        params["dateFormat"] = dateFormat
    elif timeFormat:
        params["timeFormat"] = timeFormat
    t = task.Task([params])
    if df is None:
        return t
    # The task holds exactly one metric, so its result is the first entry.
    return t.run(df)[0]["scores"]
예제 #6
0
def rule(conditions, df=None):
    """
    Build the rule metric and either run it immediately or return it wrapped
    in a Task.

    With a df, the metric is executed and the scores of the (single) metric
    are returned; without one, the Task instance holding the metric is handed
    back so it can be run later (possibly after adding other tasks/metrics).

    :param conditions: Conditions on which to run the metric.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    spec = {"metric": "rule", "conditions": conditions}
    metric_task = task.Task([spec])
    # Only one metric in the task, hence index 0 of the run result.
    return metric_task if df is None else metric_task.run(df)[0]["scores"]
예제 #7
0
def freshness(columns, df=None, dateFormat=None, timeFormat=None):
    """
    Build the freshness metric and either run it immediately or return it
    wrapped in a Task.

    With a df, the metric is executed and the scores of the (single) metric
    are returned; without one, the Task instance holding the metric is handed
    back so it can be run later (possibly after adding other tasks/metrics).
    Use http://strftime.org/ directives to express formats.

    Note that this function itself does not check the two format arguments:
    the first truthy one of dateFormat, timeFormat (in that order) is put in
    the metric parameters, and none is if both are falsy.

    :param columns: Columns on which to run the metric, columns of type string will be casted to timestamp
        using the dateFormat or timeFormat argument.
    :type columns: list
    :param dateFormat: Format in which the values in columns are if those columns are of type string; otherwise they must
        be of type timestamp. Use this parameter if you are interested in a result in terms of days.
        Either dateFormat or timeFormat must be passed, but not both.
        Use http://strftime.org/ directives to express formats.
    :type dateFormat: str
    :param timeFormat: Format in which the values in columns are if those columns are of type string; otherwise they must
        be of type timestamp. Use this parameter if you are interested in results in terms of seconds.
        Either dateFormat or timeFormat must be passed, but not both.
        Use http://strftime.org/ directives to express formats.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this function return a Task instance containing
        this metric to be run later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this metric (with these parameters) to be
        run later.
    :rtype: list/Task
    """
    spec = {"metric": "freshness", "columns": columns}
    # Forward at most one format; dateFormat wins when both are truthy.
    for key, fmt in (("dateFormat", dateFormat), ("timeFormat", timeFormat)):
        if fmt:
            spec[key] = fmt
            break

    metric_task = task.Task([spec])
    if df is not None:
        # Only one metric in the task, hence index 0 of the run result.
        return metric_task.run(df)[0]["scores"]
    return metric_task