def mutual_info(when, then, df=None):
    """
    Build the mutual_info metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    :param when: First column on which to compute MI.
    :type when: str/int
    :param then: Second column on which to compute MI.
    :type then: str/int
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    # describe the metric and wrap it in a task holding only this metric
    metric_spec = dict(metric="mutual_info", when=when, then=then)
    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
def grouprule(columns, having, conditions=None, df=None):
    """
    Build the groupRule metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    :param columns: Columns on which to run the metric, grouping data.
    :param having: Conditions to apply to groups.
    :type having: list
    :param conditions: Conditions on which to run the metric, filtering
        data before grouping, can be None (the key is then left out of the
        metric parameters).
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    # mandatory parameters first, then the optional pre-grouping filter
    metric_spec = dict(metric="groupRule", columns=columns, having=having)
    if conditions is not None:
        metric_spec.update(conditions=conditions)

    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
def deduplication(columns=None, df=None):
    """
    Build the deduplication metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    :param columns: Columns on which to run the metric, None to run the
        deduplication metric on the whole table (deduplication on rows).
    :type columns: list
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    metric_spec = dict(metric="deduplication")
    # the "columns" key is only present when columns were actually given
    if columns is not None:
        metric_spec.update(columns=columns)

    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
def constraint(when, then, conditions=None, df=None):
    """
    Build the constraint metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    :param when: A list of columns in the df to use as the precondition of
        a functional constraint. No column should be in both when and then.
    :type when: list
    :param then: A list of columns in the df to use as the postcondition of
        a functional constraint. No column should be in both when and then.
    :type then: list
    :param conditions: Conditions on which to filter data before applying
        the metric. A falsy value (None or an empty list) is left out of
        the metric parameters.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    metric_spec = dict(metric="constraint", when=when, then=then)
    # truthiness check on purpose: empty conditions are dropped as well
    if conditions:
        metric_spec.update(conditions=conditions)

    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
def deduplication_approximated(columns, df=None):
    """
    Build the deduplication_approximated metric and, if a dataframe is
    given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    Differently from deduplication, here columns must be specified
    (deduplication_approximated does not work on a whole row level).

    :param columns: Columns on which to run the metric.
    :type columns: list
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    metric_spec = dict(metric="deduplication_approximated")
    # an explicit None is tolerated here: the "columns" key is then omitted
    if columns is not None:
        metric_spec.update(columns=columns)

    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
class _FormatArgumentError(ValueError, AssertionError):
    """
    Raised when the dateFormat/timeFormat arguments are inconsistent.

    Derives from ValueError because the problem is an invalid argument
    value, and from AssertionError so that callers written against the
    previous assert-based validation keep catching it.
    """


def timeliness(columns, value, df=None, dateFormat=None, timeFormat=None):
    """
    Build the timeliness metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).
    Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    directives to express formats.

    :param columns: Columns on which to run the metric, columns of type
        string will be casted to timestamp using the dateFormat or
        timeFormat argument.
    :type columns: list
    :param value: Value used to run the metric, confronting values in the
        specified columns against it.
    :type value: str
    :param dateFormat: Format in which the value (and values in columns, if
        they are of string type) are; used to cast columns if they contain
        dates as strings. Either dateFormat or timeFormat must be passed,
        but not both.
    :type dateFormat: str
    :param timeFormat: Format in which the value (and values in columns, if
        they are of string type) are; used to cast columns if they contain
        dates as strings. Either dateFormat or timeFormat must be passed,
        but not both.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    :raises ValueError: If neither or both of dateFormat and timeFormat are
        passed (the raised exception is also an AssertionError).
    """
    # Exactly one of the two formats must be given. This is an explicit
    # raise rather than an assert so the check survives `python -O`.
    if (dateFormat is None) == (timeFormat is None):
        problem = "neither was given" if dateFormat is None else "both were given"
        raise _FormatArgumentError(
            "Pass either a dateFormat or a timeFormat, not both (%s)." % problem
        )

    # make a dict representing the parameters
    params = {"metric": "timeliness", "columns": columns, "value": value}
    # an empty format string passes the check above but is not forwarded
    if dateFormat:
        params["dateFormat"] = dateFormat
    elif timeFormat:
        params["timeFormat"] = timeFormat

    t = task.Task([params])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return t
    # only the "scores" entry of the first run result is returned
    return t.run(df)[0]["scores"]
def rule(conditions, df=None):
    """
    Build the rule metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).

    :param conditions: Conditions on which to run the metric.
    :type conditions: list
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    # describe the metric and wrap it in a task holding only this metric
    metric_spec = dict(metric="rule", conditions=conditions)
    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]
def freshness(columns, df=None, dateFormat=None, timeFormat=None):
    """
    Build the freshness metric and, if a dataframe is given, run it.

    If a df is passed, the metric is run immediately and the result is
    returned as a list of scores; otherwise an instance of the Task class
    containing this metric is returned, to be run later (possibly after
    adding other tasks/metrics to it).
    Use https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    directives to express formats.

    :param columns: Columns on which to run the metric, columns of type
        string will be casted to timestamp using the dateFormat or
        timeFormat argument.
    :type columns: list
    :param dateFormat: Format in which the values in columns are if those
        columns are of type string; otherwise they must be of type date or
        timestamp. Use this parameter if you are interested in a result in
        terms of days. Either dateFormat or timeFormat must be passed, but
        not both.
    :type dateFormat: str
    :param timeFormat: Format in which the values in columns are if those
        columns are of type string; otherwise they must be of type
        timestamp. Use this parameter if you are interested in results in
        terms of seconds. Either dateFormat or timeFormat must be passed,
        but not both.
    :type timeFormat: str
    :param df: Dataframe on which to run the metric, None to have this
        function return a Task instance containing this metric to be run
        later.
    :type df: DataFrame
    :return: Either a list of scores or a Task instance containing this
        metric (with these parameters) to be run later.
    :rtype: list/Task
    """
    metric_spec = dict(metric="freshness", columns=columns)

    # At most one format is forwarded. This function does not itself enforce
    # the "exactly one" rule: dateFormat wins if both are given, and neither
    # key is set if both are empty/None.
    for key, fmt in (("dateFormat", dateFormat), ("timeFormat", timeFormat)):
        if fmt:
            metric_spec[key] = fmt
            break

    metric_task = task.Task([metric_spec])

    # no dataframe: hand back the task so the caller can run it later
    if df is None:
        return metric_task
    # only the "scores" entry of the first run result is returned
    return metric_task.run(df)[0]["scores"]