Exemplo n.º 1
0
 def load_data(self,
               name,
               f_path,
               f_name,
               get_tid,
               get_attr,
               get_val,
               na_values=None):
     """Load ground-truth ("clean") data from a CSV file into the database.

     :param name: (str) name for the resulting table.
     :param f_path: (str) directory containing the CSV file.
     :param f_name: (str) CSV file name.
     :param get_tid: callable applied per row to extract the tuple ID.
     :param get_attr: callable applied per row to extract the attribute name.
     :param get_val: callable applied per row to extract the value.
     :param na_values: values treated as NA when parsing the CSV.
     :return: tuple (status message, load time in seconds).
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # recommended replacement for measuring elapsed wall time.
     tic = time.perf_counter()
     try:
         raw_data = pd.read_csv(os.path.join(f_path, f_name),
                                na_values=na_values)
         # Replace NAs with a sentinel so downstream string ops don't fail.
         raw_data.fillna('_nan_', inplace=True)
         raw_data['_tid_'] = raw_data.apply(get_tid, axis=1)
         raw_data['_attribute_'] = raw_data.apply(get_attr, axis=1)
         raw_data['_value_'] = raw_data.apply(get_val, axis=1)
         raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
         # Normalize string to lower-case and strip whitespaces.
         raw_data['_attribute_'] = raw_data['_attribute_'].apply(
             lambda x: x.lower())
         raw_data['_value_'] = raw_data['_value_'].apply(
             lambda x: x.strip())
         self.clean_data = Table(name, Source.DF, raw_data)
         self.clean_data.store_to_db(self.ds.engine.engine)
         self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
         self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
         status = 'DONE Loading ' + f_name
     except Exception:
         logging.error('load_data for table %s', name)
         raise
     toc = time.perf_counter()
     load_time = toc - tic
     return status, load_time
Exemplo n.º 2
0
    def load_data(self,
                  name,
                  fpath,
                  tid_col,
                  attr_col,
                  val_col,
                  na_values=None):
        """Load ground-truth ("clean") data from a CSV file into the database.

        :param name: (str) name for the resulting table.
        :param fpath: (str) path to the CSV file.
        :param tid_col: (str) column holding the tuple ID.
        :param attr_col: (str) column holding the attribute name.
        :param val_col: (str) column holding the ground-truth value.
        :param na_values: values treated as NA when parsing the CSV.
        :return: tuple (status message, load time in seconds).
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for measuring elapsed wall time.
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(fpath,
                                   na_values=na_values,
                                   dtype=str,
                                   encoding='utf-8')
            # We drop any ground truth values that are NULLs since we follow
            # the closed-world assumption (if it's not there it's wrong).
            # TODO: revisit this once we allow users to specify which
            # attributes may be NULL.
            raw_data.dropna(subset=[val_col], inplace=True)
            raw_data.rename(
                {
                    tid_col: '_tid_',
                    attr_col: '_attribute_',
                    val_col: '_value_'
                },
                axis='columns',
                inplace=True)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            raw_data['_tid_'] = raw_data['_tid_'].astype(int)

            # Strip whitespace and lower-case values, but only for
            # categorical attributes (numerical values stay untouched).
            categorical_attrs = self.ds.categorical_attrs
            if categorical_attrs:
                cat_cells = raw_data['_attribute_'].isin(categorical_attrs)
                raw_data.loc[cat_cells, '_value_'] = \
                    raw_data.loc[cat_cells, '_value_'].astype(str).str.strip().str.lower()

            self.clean_data = Table(name, Source.DF, df=raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading {fname}'.format(
                fname=os.path.basename(fpath))
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time
Exemplo n.º 3
0
    def __init__(self,
                 fpath=None,
                 df=None,
                 db_engine=None,
                 table_name=None,
                 schema_name=None,
                 id_col="_tid_",
                 attr_col="attribute",
                 name="ErrorLoaderDetector"):
        """
        :param fpath: (str) Path to source csv file to load errors
        :param df: (DataFrame) dataframe containing the errors
        :param db_engine: (DBEngine) Database engine object
        :param table_name: (str) Relational table considered for loading errors
        :param schema_name: (str) Schema in which :param table_name: exists
        :param id_col: (str) ID column name
        :param attr_col: (str) Attribute column name
        :param name: (str) name of the detector

        To load from csv file, :param fpath: must be specified.
        To load from a relational table, :param db_engine:, and
        :param table_name: must be specified, optionally specifying :param schema_name:.
        """
        super(ErrorsLoaderDetector, self).__init__(name)
        # Choose the source kind from the provided inputs; precedence is
        # file > dataframe > database table.
        src = None
        dataset_name = None
        if fpath is not None:
            dataset_name = "errors_file"
            src = Source.FILE
        elif df is not None:
            dataset_name = "errors_df"
            src = Source.DF
        elif (db_engine is not None) and (table_name is not None):
            dataset_name = table_name
            src = Source.DB
        else:
            # Fixed typo in the user-facing message ("intializing").
            raise Exception(
                "ERROR while initializing ErrorsLoaderDetector. Please provide (<fpath>), (<db_engine> and <table_name>), OR <df>"
            )

        self.errors_table = Table(dataset_name,
                                  src,
                                  exclude_attr_cols=[attr_col],
                                  fpath=fpath,
                                  df=df,
                                  schema_name=schema_name,
                                  db_engine=db_engine)

        # The loaded table must contain exactly [id_col, attr_col], in order.
        expected_schema = [id_col, attr_col]
        if list(self.errors_table.df.columns) != expected_schema:
            raise Exception(
                "ERROR while initializing ErrorsLoaderDetector: The loaded errors table does not match the expected schema of {}"
                .format(expected_schema))

        # Normalize dtypes: integer tuple IDs, string attribute names.
        self.errors_table.df = self.errors_table.df.astype({
            id_col: int,
            attr_col: str
        })
    def create_table(self,
                     table_name,
                     primary_id=None,
                     primary_type=None,
                     primary_increment=None):
        """Load ``table_name`` or create it if it does not exist yet.

        For a newly created table you can choose the primary key field's name
        (``primary_id``) and type (``primary_type``); the default is an
        auto-incrementing integer column ``id``. Text-typed primary keys leave
        uniqueness up to the caller. Setting ``primary_increment`` to
        ``False`` disables auto-increment for numeric primary keys.

        Returns a :py:class:`Table <dataset.Table>` instance.
        ::

            table = db.create_table('population')

            # custom id and type
            table2 = db.create_table('population2', 'age')
            table3 = db.create_table('population3',
                                     primary_id='city',
                                     primary_type=db.types.text)
            # custom length of String
            table4 = db.create_table('population4',
                                     primary_id='city',
                                     primary_type=db.types.string(25))
            # no primary key
            table5 = db.create_table('population5',
                                     primary_id=False)
        """
        assert not isinstance(primary_type, str), \
            "Text-based primary_type support is dropped, use db.types."
        key = normalize_table_name(table_name)
        with self.lock:
            # Serve an already-loaded table from the cache when possible.
            if key in self._tables:
                return self._tables[key]
            table = Table(
                self,
                key,
                primary_id=primary_id,
                primary_type=primary_type,
                primary_increment=primary_increment,
                auto_create=True,
            )
            self._tables[key] = table
            return table
Exemplo n.º 5
0
    def load_table(self, table_name):
        """Load an existing table.

        This fails if the table is not already present in the database; when
        it is, its columns are reflected and exposed on the returned
        :py:class:`Table <dataset.Table>` object.

        Returns a :py:class:`Table <dataset.Table>` instance.
        ::

            table = db.load_table('population')
        """
        key = normalize_table_name(table_name)
        with self.lock:
            # EAFP: reuse the cached Table, reflecting it only on first use.
            try:
                return self._tables[key]
            except KeyError:
                table = Table(self, key)
                self._tables[key] = table
                return table
Exemplo n.º 6
0
class EvalEngine:
    """Computes repair-quality metrics (precision/recall/F1) for a repair run
    by comparing inferred values against a loaded ground-truth table."""

    def __init__(self, env, dataset):
        # env: engine configuration/environment (opaque here).
        # ds: dataset object providing DB access via `ds.engine`.
        self.env = env
        self.ds = dataset

    def load_data(self,
                  name,
                  f_path,
                  f_name,
                  get_tid,
                  get_attr,
                  get_val,
                  na_values=None):
        """Load ground-truth data from CSV into the DB as ``self.clean_data``.

        :param name: (str) name for the resulting table.
        :param f_path: (str) directory containing the CSV file.
        :param f_name: (str) CSV file name.
        :param get_tid: callable applied per row to extract the tuple ID.
        :param get_attr: callable applied per row to extract the attribute.
        :param get_val: callable applied per row to extract the value.
        :param na_values: values treated as NA when parsing.
        :return: tuple (status message, load time in seconds).
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for measuring elapsed wall time.
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(os.path.join(f_path, f_name),
                                   na_values=na_values)
            raw_data.fillna('_nan_', inplace=True)
            raw_data['_tid_'] = raw_data.apply(get_tid, axis=1)
            raw_data['_attribute_'] = raw_data.apply(get_attr, axis=1)
            raw_data['_value_'] = raw_data.apply(get_val, axis=1)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            # Normalize string to lower-case and strip whitespaces.
            raw_data['_attribute_'] = raw_data['_attribute_'].apply(
                lambda x: x.lower())
            raw_data['_value_'] = raw_data['_value_'].apply(
                lambda x: x.strip())
            self.clean_data = Table(name, Source.DF, raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading ' + f_name
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time

    def evaluate_repairs(self):
        """Memoize all counts, then derive the five summary metrics."""
        self.compute_total_repairs()
        self.compute_total_repairs_grdt()
        self.compute_total_errors()
        self.compute_detected_errors()
        self.compute_correct_repairs()
        prec = self.compute_precision()
        rec = self.compute_recall()
        rep_recall = self.compute_repairing_recall()
        f1 = self.compute_f1()
        rep_f1 = self.compute_repairing_f1()
        return prec, rec, rep_recall, f1, rep_f1

    def eval_report(self):
        """Return a formatted metrics report string and the time to build it."""
        tic = time.perf_counter()
        prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs()
        report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (Grdth present) = %d" % (
            prec, rec, rep_recall, f1, rep_f1, self.detected_errors,
            self.total_errors, self.correct_repairs, self.total_repairs,
            self.total_repairs_grdt)
        toc = time.perf_counter()
        report_time = toc - tic
        return report, report_time

    def compute_total_repairs(self):
        """Memoize the number of repairs: inferred cells whose inferred value
        differs from the initial value."""
        query = "SELECT count(*) FROM " \
                "(SELECT _vid_ " \
                 "FROM %s as t1, %s as t2 " \
                 "WHERE t1._tid_ = t2._tid_ " \
                   "AND t1.attribute = t2.attribute " \
                   "AND t1.init_value != t2.rv_value) AS t"\
                %(AuxTables.cell_domain.name, AuxTables.inf_values_dom.name)
        res = self.ds.engine.execute_query(query)
        self.total_repairs = float(res[0][0])

    def compute_total_repairs_grdt(self):
        """Memoize the number of repairs restricted to cells present in the
        ground-truth (clean) data."""
        query = "SELECT count(*) FROM " \
                "(SELECT _vid_ " \
                 "FROM %s as t1, %s as t2, %s as t3 " \
                 "WHERE t1._tid_ = t2._tid_ " \
                   "AND t1.attribute = t2.attribute " \
                   "AND t1.init_value != t2.rv_value " \
                   "AND t1._tid_ = t3._tid_ " \
                   "AND t1.attribute = t3._attribute_) AS t"\
                %(AuxTables.cell_domain.name, AuxTables.inf_values_dom.name, self.clean_data.name)
        res = self.ds.engine.execute_query(query)
        self.total_repairs_grdt = float(res[0][0])

    def compute_total_errors(self):
        """Memoize the number of cells whose initial value is wrong w.r.t. the
        ground truth (one query per attribute)."""
        queries = []
        total_errors = 0.0
        for attr in self.ds.get_attributes():
            query = errors_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)
        for res in results:
            total_errors += float(res[0][0])
        self.total_errors = total_errors

    def compute_total_errors_grdt(self):
        # NOTE(review): this is a verbatim duplicate of compute_total_errors
        # and it overwrites self.total_errors rather than setting a separate
        # self.total_errors_grdt — looks like a copy-paste slip; behavior is
        # preserved here pending confirmation of the intended semantics.
        queries = []
        total_errors = 0.0
        for attr in self.ds.get_attributes():
            query = errors_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)
        for res in results:
            total_errors += float(res[0][0])
        self.total_errors = total_errors

    def compute_detected_errors(self):
        """Memoize the number of error cells flagged during error detection;
        always <= total errors."""
        query = "SELECT count(*) FROM " \
                "(SELECT _vid_ " \
                "FROM %s as t1, %s as t2, %s as t3 " \
                "WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ " \
                "AND t1.attribute = t2._attribute_ " \
                "AND t1.init_value != t2._value_) AS t" \
                % (AuxTables.cell_domain.name, self.clean_data.name, AuxTables.dk_cells.name)
        res = self.ds.engine.execute_query(query)
        self.detected_errors = float(res[0][0])

    def compute_correct_repairs(self):
        """Memoize the number of error cells that were correctly inferred
        (one query per attribute)."""
        queries = []
        correct_repairs = 0.0
        for attr in self.ds.get_attributes():
            query = correct_repairs_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr,
                inf_dom=AuxTables.inf_values_dom.name)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)
        for res in results:
            correct_repairs += float(res[0][0])
        self.correct_repairs = correct_repairs

    def compute_recall(self):
        """Recall = correct repairs / total errors (0 when there are none)."""
        # Guard against ZeroDivisionError on a dataset with no errors.
        if self.total_errors == 0:
            return 0
        return self.correct_repairs / self.total_errors

    def compute_repairing_recall(self):
        """Repairing recall = correct repairs / detected errors (0 if none)."""
        if self.detected_errors == 0:
            return 0
        return self.correct_repairs / self.detected_errors

    def compute_precision(self):
        """Precision = correct repairs / repairs with ground truth (0 if none)."""
        if self.total_repairs_grdt == 0:
            return 0
        return self.correct_repairs / self.total_repairs_grdt

    def compute_f1(self):
        """Harmonic mean of precision and recall (0 when both are 0)."""
        prec = self.compute_precision()
        rec = self.compute_recall()
        if prec + rec == 0:
            return 0
        f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    def compute_repairing_f1(self):
        """Harmonic mean of precision and repairing recall (0 when both 0)."""
        prec = self.compute_precision()
        rec = self.compute_repairing_recall()
        if prec + rec == 0:
            return 0
        f1 = 2 * (prec * rec) / (prec + rec)
        return f1
Exemplo n.º 7
0
class EvalEngine:
    """Computes repair-quality metrics (precision/recall/F1 and RMSE) for a
    repair run, comparing inferred values against loaded ground truth.
    Categorical attributes get precision/recall-style metrics; numerical
    attributes get RMSE."""

    def __init__(self, env, dataset):
        # env: engine configuration/environment (opaque here).
        # ds: dataset object providing DB access and attribute metadata.
        self.env = env
        self.ds = dataset

    def load_data(self,
                  name,
                  fpath,
                  tid_col,
                  attr_col,
                  val_col,
                  na_values=None):
        """Load ground-truth data from CSV into the DB as ``self.clean_data``.

        :param name: (str) name for the resulting table.
        :param fpath: (str) path to the CSV file.
        :param tid_col: (str) column holding the tuple ID.
        :param attr_col: (str) column holding the attribute name.
        :param val_col: (str) column holding the ground-truth value.
        :param na_values: values treated as NA when parsing.
        :return: tuple (status message, load time in seconds).
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for measuring elapsed wall time.
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(fpath,
                                   na_values=na_values,
                                   dtype=str,
                                   encoding='utf-8')
            # We drop any ground truth values that are NULLs since we follow
            # the closed-world assumption (if it's not there it's wrong).
            # TODO: revisit this once we allow users to specify which
            # attributes may be NULL.
            raw_data.dropna(subset=[val_col], inplace=True)
            raw_data.rename(
                {
                    tid_col: '_tid_',
                    attr_col: '_attribute_',
                    val_col: '_value_'
                },
                axis='columns',
                inplace=True)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            raw_data['_tid_'] = raw_data['_tid_'].astype(int)

            # Strip whitespace and lower-case values, but only for
            # categorical attributes (numerical values stay untouched).
            categorical_attrs = self.ds.categorical_attrs
            if categorical_attrs:
                cat_cells = raw_data['_attribute_'].isin(categorical_attrs)
                raw_data.loc[cat_cells, '_value_'] = \
                    raw_data.loc[cat_cells, '_value_'].astype(str).str.strip().str.lower()

            self.clean_data = Table(name, Source.DF, df=raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading {fname}'.format(
                fname=os.path.basename(fpath))
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time

    def eval_report(self, attr=None):
        """
        Returns an EvalReport named tuple containing the experiment results.
        :param attr: if attr is not None, compute results for attr:
                        if attr is numerical, then only report rmse
                        if attr is categorical, then report precision, recall etc.
                     if attr is None, compute results for all attrs
        """
        tic = time.perf_counter()
        eval_report_dict = {}
        # attr is not None and is numerical
        # or attr is None(query on all attrs) and no categorical
        if attr is None or attr in self.ds.numerical_attrs:
            eval_report_dict['rmse'] = self.compute_rmse(attr) or 0.

        if attr is None or attr in self.ds.categorical_attrs:
            # if attr in self.ds.categorical_attrs or attr is None
            # attr is not None and is categorical
            # attr is None and should query on all
            correct_repairs = self.compute_correct_repairs(attr)
            total_repairs = self.compute_total_repairs(attr)
            detected_errors = self.compute_detected_errors(attr)
            total_errors = self.compute_total_errors(attr)
            total_repairs_grdt_correct, total_repairs_grdt_incorrect, \
                total_repairs_grdt = self.compute_total_repairs_grdt(attr)

            eval_report_dict['detected_errors'] = detected_errors
            eval_report_dict['total_errors'] = total_errors
            eval_report_dict['correct_repairs'] = correct_repairs
            eval_report_dict['total_repairs'] = total_repairs
            eval_report_dict['total_repairs_grdt'] = total_repairs_grdt
            eval_report_dict[
                'total_repairs_grdt_correct'] = total_repairs_grdt_correct
            eval_report_dict[
                'total_repairs_grdt_incorrect'] = total_repairs_grdt_incorrect

            eval_report_dict['precision'] = self.compute_precision(
                correct_repairs, total_repairs_grdt)
            eval_report_dict['recall'] = self.compute_recall(
                correct_repairs, total_errors)
            eval_report_dict['repair_recall'] = self.compute_repairing_recall(
                correct_repairs, detected_errors)
            eval_report_dict['f1'] = self.compute_f1(correct_repairs,
                                                     total_errors,
                                                     total_repairs_grdt)
            eval_report_dict['repair_f1'] = self.compute_repairing_f1(
                correct_repairs, detected_errors, total_repairs_grdt)

        # NOTE(review): the format below reads every field of the report even
        # when only the numerical branch ran — presumably EvalReport supplies
        # defaults for missing fields; confirm against its definition.
        report = EvalReport(**eval_report_dict)
        report_str = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, " \
                     "F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, " \
                     "Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, " \
                     "Total Repairs on correct cells (Grdth present) = %d, " \
                     "Total Repairs on incorrect cells (Grdth present) = %d, " \
                     "RMSE = %.2f" % (report.precision, report.recall, report.repair_recall, report.f1, report.repair_f1,
                                     report.detected_errors, report.total_errors, report.correct_repairs,
                                     report.total_repairs,
                                     report.total_repairs_grdt_correct, report.total_repairs_grdt_incorrect,
                                     report.rmse)

        if attr:
            report_str = "# Attribute:{};{}".format(attr, report_str)

        toc = time.perf_counter()
        report_time = toc - tic
        return report_str, report_time, report

    """
    All these compute_xxx methods are aimed at categorical attributes
    All numerical attrs should not be involved in computing precision, recall, etc.
    """

    def get_categorical_clause(self, attr):
        """
        Get where condition on which attr should be queried
        :param attr: if attr is None, generate condition on all categorical attrs
        :return: str type attr_clause showing: 'AND t1.attribute = attr' to only do query on target attr
                if attr is None, then 'AND (t1.attribute = attr1 OR t1.attribute = attr2 ...) for all categorical attrs
        """
        query_attrs = [attr] if attr else self.ds.categorical_attrs
        query_attrs_str = ["\'{}\'".format(attr) for attr in query_attrs]
        categorical_where = 't1.attribute IN (%s)' % ','.join(query_attrs_str)

        return categorical_where

    def compute_total_repairs(self, attr=None):
        """
        compute_total_repairs memoizes the number of repairs:
        the # of cells that were inferred and where the inferred value
        is not equal to the initial value.
        :param attr: if attr is not None, it must be categorical
        """
        assert attr is None or attr in self.ds.categorical_attrs

        if not self.ds.categorical_attrs:
            return 0.
        # do not query on numerical attrs (when attr is None, indicating we want query on all categorical attrs)
        # if there are no numerical attrs, then no condition should be added, just query on all attributes
        attr_clause = self.get_categorical_clause(
            attr) if self.ds.numerical_attrs else "TRUE"
        query = "SELECT count(*) FROM " \
                "  (SELECT _vid_ " \
                '     FROM "{}" as t1, "{}" as t2 ' \
                "    WHERE t1._tid_ = t2._tid_ " \
                "      AND t1.attribute = t2.attribute " \
                "      AND t1.init_value != t2.rv_value " \
                "      AND {}) AS t".format(AuxTables.cell_domain.name,
                                            AuxTables.inf_values_dom.name,
                                            attr_clause)
        res = self.ds.engine.execute_query(query)
        return res[0][0]

    def compute_total_repairs_grdt(self, attr=None):
        """
        compute_total_repairs_grdt memoizes the number of repairs for cells
        that are specified in the clean/ground truth data. Otherwise repairs
        are defined the same as compute_total_repairs.

        We also distinguish between repairs on correct cells and repairs on
        incorrect cells (correct cells are cells where init == ground truth).
        :param attr: if attr is not None, it must be categorical
        """

        assert attr is None or attr in self.ds.categorical_attrs
        if not self.ds.categorical_attrs:
            return 0.
        # do not query on numerical attrs (when attr is None, indicating we want query on all categorical attrs)
        # if there are no numerical attrs, then no condition should be added, just query on all attributes
        attr_clause = self.get_categorical_clause(
            attr) if self.ds.numerical_attrs else "TRUE"

        query = """
            SELECT
                (t1.init_value = ANY(string_to_array(regexp_replace(t3._value_,\'[{{\"\"}}]\',\'\',\'gi\'),\'|\'))) AS is_correct,
                count(*)
            FROM   "{}" as t1, "{}" as t2, "{}" as t3
            WHERE  t1._tid_ = t2._tid_
              AND  t1.attribute = t2.attribute
              AND  t1.init_value != t2.rv_value
              AND  t1._tid_ = t3._tid_
              AND  t1.attribute = t3._attribute_
              AND  {}
            GROUP BY is_correct
              """.format(AuxTables.cell_domain.name,
                         AuxTables.inf_values_dom.name, self.clean_data.name,
                         attr_clause)

        res = self.ds.engine.execute_query(query)

        # Memoize the number of repairs on correct cells and incorrect cells.
        # Since we do a GROUP BY we need to check which row of the result
        # corresponds to the correct/incorrect counts.
        total_repairs_grdt_correct, total_repairs_grdt_incorrect = 0, 0
        if not res:
            return 0, 0, 0

        if res[0][0]:
            correct_idx, incorrect_idx = 0, 1
        else:
            correct_idx, incorrect_idx = 1, 0
        if correct_idx < len(res):
            total_repairs_grdt_correct = float(res[correct_idx][1])
        if incorrect_idx < len(res):
            total_repairs_grdt_incorrect = float(res[incorrect_idx][1])
        total_repairs_grdt = total_repairs_grdt_correct + total_repairs_grdt_incorrect

        return total_repairs_grdt_correct, total_repairs_grdt_incorrect, total_repairs_grdt

    def compute_total_errors(self, attr=None):
        """
        compute_total_errors memoizes the number of cells that have a
        wrong initial value: requires ground truth data.
        :param attr: if attr is not None, it must be categorical
        """
        assert attr is None or attr in self.ds.categorical_attrs
        if not self.ds.categorical_attrs:
            return 0.

        queries = []
        total_errors = 0.0

        query_attrs = [attr] if attr else self.ds.categorical_attrs
        for attr in query_attrs:
            query = errors_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)

        for i in range(len(results)):
            res = results[i]
            total_errors += float(res[0][0])

        return total_errors

    def compute_detected_errors(self, attr=None):
        """
        compute_detected_errors memoizes the number of error cells that
        were detected in error detection: requires ground truth.

        This value is always equal or less than total errors (see
        compute_total_errors).
        :param attr: if attr is not None, it must be categorical
        """
        assert attr is None or attr in self.ds.categorical_attrs
        if not self.ds.categorical_attrs:
            return 0.
        # do not query on numerical attrs (when attr is None, indicating we want query on all categorical attrs)
        # if there are no numerical attrs, then no condition should be added, just query on all attributes
        attr_clause = self.get_categorical_clause(
            attr) if self.ds.numerical_attrs else "TRUE"

        query = "SELECT count(*) FROM " \
                "  (SELECT _vid_ " \
                '   FROM "{}" as t1, "{}" as t2, "{}" as t3 ' \
                "   WHERE t1._tid_ = t2._tid_ " \
                "     AND t1._cid_ = t3._cid_ " \
                "     AND t1.attribute = t2._attribute_ " \
                "     AND NOT t1.init_value = ANY(string_to_array(regexp_replace(t2._value_,\'[{{\"\"}}]\',\'\',\'gi\'),\'|\')) " \
                "     AND {}) AS t".format(AuxTables.cell_domain.name,
                                           self.clean_data.name,
                                           AuxTables.dk_cells.name,
                                           attr_clause)
        res = self.ds.engine.execute_query(query)
        return float(res[0][0])

    def compute_correct_repairs(self, attr=None):
        """
        compute_correct_repairs memoizes the number of error cells
        that were correctly inferred.

        This value is always equal or less than total errors (see
        compute_total_errors).
        :param attr: if attr is not None, it must be categorical
        """
        assert attr is None or attr in self.ds.categorical_attrs
        if not self.ds.categorical_attrs:
            return 0.

        queries = []
        correct_repairs = 0.0

        query_attrs = [attr] if attr else self.ds.categorical_attrs
        for attr in query_attrs:
            query = correct_repairs_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr,
                inf_dom=AuxTables.inf_values_dom.name)
            queries.append(query)
        results = self.ds.engine.execute_queries(queries)

        for i in range(len(results)):
            res = results[i]
            correct_repairs += float(res[0][0])

        return correct_repairs

    def compute_rmse(self, attr=None):
        """
        Should check all the dk_cells in numerical attributes
        compute RMS error for all dk_cells in numerical attributes
        :return:
        """
        assert attr is None or attr in self.ds.numerical_attrs
        if not self.ds.numerical_attrs:
            return 0.

        query_attrs = [attr] if attr else self.ds.numerical_attrs
        query_attrs_str = ["\'{}\'".format(attr) for attr in query_attrs]
        query_attrs_sql = '(%s)' % ','.join(query_attrs_str)
        query = rmse_template.substitute(grdt_table=self.clean_data.name,
                                         inf_dom=AuxTables.inf_values_dom.name,
                                         attrs_list=query_attrs_sql)
        res = self.ds.engine.execute_query(query)

        return res[0][0]

    @staticmethod
    def compute_recall(correct_repairs, total_errors):
        """
        Computes the recall (# of correct repairs / # of total errors).
        """
        if total_errors == 0:
            return 0
        return correct_repairs / total_errors

    @staticmethod
    def compute_repairing_recall(correct_repairs, detected_errors):
        """
        Computes the _repairing_ recall (# of correct repairs / # of total
        _detected_ errors).
        """
        if detected_errors == 0:
            return 0
        return correct_repairs / detected_errors

    @staticmethod
    def compute_precision(correct_repairs, total_repairs_grdt):
        """
        Computes precision (# correct repairs / # of total repairs w/ ground truth)
        """
        if total_repairs_grdt == 0:
            return 0
        return correct_repairs / total_repairs_grdt

    @staticmethod
    def compute_f1(correct_repairs, total_errors, total_repairs_grdt):
        """Harmonic mean of precision and recall (0 when both are 0)."""
        prec = EvalEngine.compute_precision(correct_repairs,
                                            total_repairs_grdt)
        rec = EvalEngine.compute_recall(correct_repairs, total_errors)
        if prec + rec == 0:
            f1 = 0
        else:
            f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    @staticmethod
    def compute_repairing_f1(correct_repairs, detected_errors,
                             total_repairs_grdt):
        """Harmonic mean of precision and repairing recall (0 when both 0)."""
        prec = EvalEngine.compute_precision(correct_repairs,
                                            total_repairs_grdt)
        rec = EvalEngine.compute_repairing_recall(correct_repairs,
                                                  detected_errors)
        if prec == 0 and rec == 0:
            f1 = 0
        else:
            f1 = 2 * (prec * rec) / (prec + rec)

        return f1

    def log_weak_label_stats(self):
        """Log a breakdown of weak-labelling outcomes (clean/dirty cells,
        label vs. ground truth vs. inference agreement) at DEBUG level."""
        query = """
        select
            (t3._tid_ is NULL) as clean,
            (t1.fixed) as status,
            (t1.init_value =  ANY(string_to_array(regexp_replace(t2._value_,\'[{{\"\"}}]\',\'\',\'gi\'),\'|\'))) as init_eq_grdth,
            (t1.weak_label = ANY(string_to_array(regexp_replace(t2._value_,\'[{{\"\"}}]\',\'\',\'gi\'),\'|\'))) as wl_eq_grdth,
            (t1.weak_label = t4.rv_value) as wl_eq_infer,
            (t4.rv_value = ANY(string_to_array(regexp_replace(t2._value_,\'[{{\"\"}}]\',\'\',\'gi\'),\'|\'))) as infer_eq_grdth,
            count(*) as count
        from
            "{cell_domain}" as t1,
            "{clean_data}" as t2
            left join "{dk_cells}" as t3 on t2._tid_ = t3._tid_ and t2._attribute_ = t3.attribute
            left join "{inf_values_dom}" as t4 on t2._tid_ = t4._tid_ and t2._attribute_ = t4.attribute where t1._tid_ = t2._tid_ and t1.attribute = t2._attribute_
        group by
            clean,
            status,
            init_eq_grdth,
            wl_eq_grdth,
            wl_eq_infer,
            infer_eq_grdth
        """.format(cell_domain=AuxTables.cell_domain.name,
                   clean_data=self.clean_data.name,
                   dk_cells=AuxTables.dk_cells.name,
                   inf_values_dom=AuxTables.inf_values_dom.name)

        res = self.ds.engine.execute_query(query)

        df_stats = pd.DataFrame(res,
                                columns=[
                                    "is_clean", "cell_status", "init = grdth",
                                    "wlabel = grdth", "wlabel = infer",
                                    "infer = grdth", "count"
                                ])
        df_stats = df_stats.sort_values(list(
            df_stats.columns)).reset_index(drop=True)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', len(df_stats))
        # -1 for max_colwidth was deprecated in pandas 1.0 and removed in 2.0;
        # None is the supported way to disable column-width truncation.
        pd.set_option('display.max_colwidth', None)
        logging.debug(
            "weak label statistics: (cell_status: 0 - none, 1 - wlabelled, 2 - single value)\n%s",
            df_stats)
        pd.reset_option('display.max_columns')
        pd.reset_option('display.max_rows')
        pd.reset_option('display.max_colwidth')
Exemplo n.º 8
0
class EvalEngine:
    def __init__(self, env, dataset):
        """
        Construct an evaluation engine bound to a dataset.

        :param env: mapping of configuration flags; this class only reads
            the 'verbose' key (see evaluate_repairs).
        :param dataset: dataset object exposing the DB engine (``.engine``),
            the raw data table (``.raw_data``) and ``get_attributes()``.
        """
        self.env = env
        self.ds = dataset

    def load_data(self,
                  name,
                  fpath,
                  tid_col,
                  attr_col,
                  val_col,
                  na_values=None):
        """
        Load the ground truth ("clean") data from a CSV file, normalize it
        into (_tid_, _attribute_, _value_) triplets, and store it in the DB
        as ``self.clean_data`` with indexes on _tid_ and _attribute_.

        :param name: (str) name of the DB table to create for the clean data.
        :param fpath: (str) path to the ground truth CSV file.
        :param tid_col: (str) column in the CSV holding the tuple ID.
        :param attr_col: (str) column in the CSV holding the attribute name.
        :param val_col: (str) column in the CSV holding the ground truth value.
        :param na_values: value(s) pandas should treat as NA when parsing.
        :return: (status message, load time in seconds) tuple.
        :raises: re-raises any exception after logging the table name.
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended monotonic wall-clock replacement.
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(fpath,
                                   na_values=na_values,
                                   encoding='utf-8')
            # We drop any ground truth values that are NULLs since we follow
            # the closed-world assumption (if it's not there it's wrong).
            # TODO: revisit this once we allow users to specify which
            # attributes may be NULL.
            raw_data.dropna(subset=[val_col], inplace=True)
            raw_data.fillna(NULL_REPR, inplace=True)
            raw_data.rename(
                {
                    tid_col: '_tid_',
                    attr_col: '_attribute_',
                    val_col: '_value_'
                },
                axis='columns',
                inplace=True)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            # Normalize values: strip surrounding whitespace and lower-case
            # so comparisons against inferred/initial values are consistent.
            raw_data['_value_'] = raw_data['_value_'].str.strip().str.lower()
            self.clean_data = Table(name, Source.DF, df=raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading {fname}'.format(
                fname=os.path.basename(fpath))
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time

    def evaluate_repairs(self):
        """
        Memoize all repair/error counts, then derive the evaluation metrics.

        :return: (precision, recall, repairing recall, F1, repairing F1).
        """
        # Populate the memoized counts the metric methods below depend on.
        for memoize in (self.compute_total_repairs,
                        self.compute_total_repairs_grdt,
                        self.compute_total_errors,
                        self.compute_detected_errors,
                        self.compute_correct_repairs):
            memoize()

        metrics = (self.compute_precision(),
                   self.compute_recall(),
                   self.compute_repairing_recall(),
                   self.compute_f1(),
                   self.compute_repairing_f1())

        if self.env['verbose']:
            self.log_weak_label_stats()

        return metrics

    def eval_report(self):
        """
        Returns an EvalReport named tuple containing the experiment results.

        :return: (human-readable report string, wall-clock seconds spent
            generating it, EvalReport named tuple).
        :raises: re-raises any exception from evaluate_repairs after logging.
        """
        # time.clock() was removed in Python 3.8; use perf_counter() instead.
        tic = time.perf_counter()
        try:
            prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs()
            report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs on correct cells (Grdth present) = %d, Total Repairs on incorrect cells (Grdth present) = %d" % (
                prec, rec, rep_recall, f1, rep_f1, self.detected_errors,
                self.total_errors, self.correct_repairs, self.total_repairs,
                self.total_repairs_grdt_correct,
                self.total_repairs_grdt_incorrect)
            eval_report = EvalReport(prec, rec, rep_recall, f1, rep_f1,
                                     self.detected_errors, self.total_errors,
                                     self.correct_repairs, self.total_repairs,
                                     self.total_repairs_grdt,
                                     self.total_repairs_grdt_correct,
                                     self.total_repairs_grdt_incorrect)
        except Exception as e:
            # Pass the exception lazily instead of eagerly %-formatting it.
            logging.error("ERROR generating evaluation report %s", e)
            raise

        toc = time.perf_counter()
        report_time = toc - tic
        return report, report_time, eval_report

    def compute_total_repairs(self):
        """
        compute_total_repairs memoizes (in ``self.total_repairs``) the
        number of repairs: the # of cells that were inferred and where
        the inferred value is not equal to the initial value.
        """

        # Count inferred cells (one _vid_ per cell) whose inferred value
        # (rv_value) differs from the initial value recorded in cell_domain.
        query = "SELECT count(*) FROM " \
                "  (SELECT _vid_ " \
                "     FROM {} as t1, {} as t2 " \
                "    WHERE t1._tid_ = t2._tid_ " \
                "      AND t1.attribute = t2.attribute " \
                "      AND t1.init_value != t2.rv_value) AS t".format(AuxTables.cell_domain.name,
                                                                      AuxTables.inf_values_dom.name)
        res = self.ds.engine.execute_query(query)
        # Single row, single column: the count.
        self.total_repairs = float(res[0][0])

    def compute_total_repairs_grdt(self):
        """
        compute_total_repairs_grdt memoizes the number of repairs for cells
        that are specified in the clean/ground truth data. Otherwise repairs
        are defined the same as compute_total_repairs.

        We also distinguish between repairs on correct cells and repairs on
        incorrect cells (correct cells are cells where init == ground truth).

        Memoizes ``self.total_repairs_grdt``,
        ``self.total_repairs_grdt_correct`` and
        ``self.total_repairs_grdt_incorrect``.
        """
        # Join cell_domain (t1), inferred values (t2) and ground truth (t3);
        # a repair is a cell whose inferred value differs from its initial
        # value. Grouping on is_correct splits repairs on correct vs.
        # incorrect cells.
        query = """
        SELECT
            (t1.init_value = t3._value_) AS is_correct,
            count(*)
        FROM   {} as t1, {} as t2, {} as t3
        WHERE  t1._tid_ = t2._tid_
          AND  t1.attribute = t2.attribute
          AND  t1.init_value != t2.rv_value
          AND  t1._tid_ = t3._tid_
          AND  t1.attribute = t3._attribute_
        GROUP BY is_correct
          """.format(AuxTables.cell_domain.name, AuxTables.inf_values_dom.name,
                     self.clean_data.name)
        res = self.ds.engine.execute_query(query)

        # Memoize the number of repairs on correct cells and incorrect cells.
        # Since we do a GROUP BY we need to check which row of the result
        # corresponds to the correct/incorrect counts.
        self.total_repairs_grdt_correct, self.total_repairs_grdt_incorrect = 0, 0
        self.total_repairs_grdt = 0
        # No repairs on ground-truth cells at all: leave the zero defaults.
        if not res:
            return

        # The query returns at most two rows; the boolean in column 0 of the
        # first row tells us whether that row is the "correct cells" count.
        if res[0][0]:
            correct_idx, incorrect_idx = 0, 1
        else:
            correct_idx, incorrect_idx = 1, 0
        # Either row may be absent when its count is zero, hence the bounds
        # checks before indexing into the result.
        if correct_idx < len(res):
            self.total_repairs_grdt_correct = float(res[correct_idx][1])
        if incorrect_idx < len(res):
            self.total_repairs_grdt_incorrect = float(res[incorrect_idx][1])
        self.total_repairs_grdt = self.total_repairs_grdt_correct + self.total_repairs_grdt_incorrect

    def compute_total_errors(self):
        """
        compute_total_errors memoizes (in ``self.total_errors``) the number
        of cells that have a wrong initial value: requires ground truth data.
        """
        # One query per attribute, generated from the shared error template.
        queries = [
            errors_template.substitute(init_table=self.ds.raw_data.name,
                                       grdt_table=self.clean_data.name,
                                       attr=attr)
            for attr in self.ds.get_attributes()
        ]
        results = self.ds.engine.execute_queries(queries)
        # Each result is a single-row/single-column count; sum them up.
        self.total_errors = sum((float(res[0][0]) for res in results), 0.0)

    def compute_detected_errors(self):
        """
        compute_detected_errors memoizes (in ``self.detected_errors``) the
        number of error cells that were detected in error detection:
        requires ground truth.

        This value is always equal or less than total errors (see
        compute_total_errors).
        """
        # A detected error is a cell flagged by error detection (dk_cells,
        # joined via _cid_) whose initial value disagrees with ground truth.
        query = "SELECT count(*) FROM " \
                "  (SELECT _vid_ " \
                "   FROM   %s as t1, %s as t2, %s as t3 " \
                "   WHERE  t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ " \
                "     AND  t1.attribute = t2._attribute_ " \
                "     AND  t1.init_value != t2._value_) AS t" \
                % (AuxTables.cell_domain.name, self.clean_data.name, AuxTables.dk_cells.name)
        res = self.ds.engine.execute_query(query)
        # Single row, single column: the count.
        self.detected_errors = float(res[0][0])

    def compute_correct_repairs(self):
        """
        compute_correct_repairs memoizes (in ``self.correct_repairs``) the
        number of error cells that were correctly inferred.

        This value is always equal or less than total errors (see
        compute_total_errors).
        """
        # One query per attribute, generated from the shared template.
        queries = [
            correct_repairs_template.substitute(
                init_table=self.ds.raw_data.name,
                grdt_table=self.clean_data.name,
                attr=attr,
                inf_dom=AuxTables.inf_values_dom.name)
            for attr in self.ds.get_attributes()
        ]
        results = self.ds.engine.execute_queries(queries)
        # Each result is a single-row/single-column count; sum them up.
        self.correct_repairs = sum((float(res[0][0]) for res in results), 0.0)

    def compute_recall(self):
        """
        Computes the recall (# of correct repairs / # of total errors).
        """
        if self.total_errors == 0:
            return 0
        return self.correct_repairs / self.total_errors

    def compute_repairing_recall(self):
        """
        Computes the _repairing_ recall (# of correct repairs / # of total
        _detected_ errors).
        """
        if self.detected_errors == 0:
            return 0
        return self.correct_repairs / self.detected_errors

    def compute_precision(self):
        """
        Computes precision (# correct repairs / # of total repairs w/ ground truth)
        """
        if self.total_repairs_grdt == 0:
            return 0
        return self.correct_repairs / self.total_repairs_grdt

    def compute_f1(self):
        prec = self.compute_precision()
        rec = self.compute_recall()
        if prec + rec == 0:
            return 0
        f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    def compute_repairing_f1(self):
        prec = self.compute_precision()
        rec = self.compute_repairing_recall()
        if prec + rec == 0:
            return 0
        f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    def log_weak_label_stats(self):
        """
        Log (at DEBUG level) a cross-tabulation of weak-labelling outcomes:
        for every combination of clean/dirty, fixed status, inferred-or-not,
        and the pairwise agreements between initial value, weak label,
        inferred value and ground truth, the number of cells in that bucket.
        """
        query = """
        select
            (t3._tid_ is NULL) as clean,
            (t1.fixed) as status,
            (t4._tid_ is NOT NULL) as inferred,
            (t1.init_value = t2._value_) as init_eq_grdth,
            (t1.init_value = t4.rv_value) as init_eq_infer,
            (t1.weak_label = t1.init_value) as wl_eq_init,
            (t1.weak_label = t2._value_) as wl_eq_grdth,
            (t1.weak_label = t4.rv_value) as wl_eq_infer,
            (t2._value_ = t4.rv_value) as infer_eq_grdth,
            count(*) as count
        from
            {cell_domain} as t1,
            {clean_data} as t2
            left join {dk_cells} as t3 on t2._tid_ = t3._tid_ and t2._attribute_ = t3.attribute
            left join {inf_values_dom} as t4 on t2._tid_ = t4._tid_ and t2._attribute_ = t4.attribute where t1._tid_ = t2._tid_ and t1.attribute = t2._attribute_
        group by
            clean,
            status,
            inferred,
            init_eq_grdth,
            init_eq_infer,
            wl_eq_init,
            wl_eq_grdth,
            wl_eq_infer,
            infer_eq_grdth
        """.format(cell_domain=AuxTables.cell_domain.name,
                   clean_data=self.clean_data.name,
                   dk_cells=AuxTables.dk_cells.name,
                   inf_values_dom=AuxTables.inf_values_dom.name)

        res = self.ds.engine.execute_query(query)

        df_stats = pd.DataFrame(res,
                                columns=[
                                    "is_clean", "cell_status", "is_inferred",
                                    "init = grdth", "init = inferred",
                                    "w. label = init", "w. label = grdth",
                                    "w. label = inferred", "infer = grdth",
                                    "count"
                                ])
        # Sort for a deterministic, diff-friendly log output.
        df_stats = df_stats.sort_values(list(
            df_stats.columns)).reset_index(drop=True)
        logging.debug("weak label statistics:")
        # Temporarily widen pandas display limits so the whole table is
        # logged without truncation, then restore the defaults.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', len(df_stats))
        # -1 for max_colwidth was deprecated in pandas 1.0 and rejected in
        # pandas >= 2.0; None is the supported way to disable truncation.
        pd.set_option('display.max_colwidth', None)
        logging.debug("%s", df_stats)
        pd.reset_option('display.max_columns')
        pd.reset_option('display.max_rows')
        pd.reset_option('display.max_colwidth')