def run(self):
        """
        Build the matched schema from the column pairs chosen by the user.

        Both record sets are passed through ``_standardise_schema`` and the
        global schema is then derived from the columns of the first record of
        source 1.

        Returns the tuple ``(self.schema, self.records1, self.records2)``.
        """
        schema_matches = SchemaMatch()

        dal = DALMongo(self.project_id)

        # Original columns of each source, indexed by column name
        schema1 = {column.name: column for column in dal.get_schema(1)}
        schema2 = {column.name: column for column in dal.get_schema(2)}

        # Register every chosen pair of column groups in the SchemaMatch object
        for pair in self.matches:
            schema_matches.add_match(
                [schema1[name] for name in pair['source1']],
                [schema2[name] for name in pair['source2']],
                pair['custom_name'])

        # Standardise the schema of both sources
        self.records1 = self._standardise_schema(
            self.records1, schema_matches, 1, schema2)
        self.records2 = self._standardise_schema(
            self.records2, schema_matches, 2, schema1)

        # Create the global schema: a single record is enough to read the
        # matched schema (assumes self.records1 is not empty).
        sample_record = self.records1[0]
        for col_name, col_obj in sample_record.columns.items():
            wanted = col_name.startswith("__new__") or self.remaining_columns
            if not wanted:
                continue
            new_column = Column(col_name, [], col_obj.type, col_obj.is_new,
                                col_obj.custom_name)
            self.add_to_schema(new_column, self.project_id)

        return self.schema, self.records1, self.records2
Exemplo n.º 2
0
    def run(self):
        """
        Fuse every matched pair of records into a single record.

        Each fused record takes the id of its match, the columns returned by
        ``get_sourcex_cols`` for both records, and the ``get_new_cols()``
        columns of the record from the preferred source.

        Returns the list of fused records.
        """
        dal = DALMongo(self.project_id)

        fused_records = []
        for match in self.matches:
            # Fetch the two matched records
            first, second = dal.get_match_pair(match)

            # Extract the columns that are not matched
            first_remaining = first.get_sourcex_cols(1)
            second_remaining = second.get_sourcex_cols(2)

            # Start the fused record with the unmatched columns
            fused = Record(id=match._id)
            fused.add_columns(first_remaining)
            fused.add_columns(second_remaining)

            # Matched columns are taken from the preferred source
            if self.preferred_source == '1':
                preferred_record = first
            else:
                preferred_record = second
            for matched_col in preferred_record.get_new_cols():
                fused.add_column(matched_col)

            fused_records.append(fused)

        return fused_records
    def _clean_source(self, source_number):
        """
        Run the configured data cleansing modules over one source.

        The records produced by the extraction step are loaded, every column
        listed in ``self.config["source<N>"]`` goes through its modules in
        order, and the records are appended to the ``source<N>_records``
        result collection.
        """
        dal = DALMongo(self.project_id)
        records = dal.get_records(ExtractionStep().class_name, source_number)

        source_key = "source{}".format(source_number)

        # NOTE: removing the columns that the user did not configure is
        # currently disabled, so those columns are kept untouched.
        for record in records:
            for col, module_configs in self.config[source_key].items():
                for module_config in module_configs:
                    cleanser = self._load_module(module_config)
                    # TODO the module should be given only the field value (string) and not the column
                    record.columns[col] = cleanser.run(record.columns[col])

        self._append_result_collection(records, source_key + "_records")
Exemplo n.º 4
0
    def add_to_schema(self, column, project_id):
        """
        Append a copy of the column to the new schema unless a column with the
        same name is already there.

        For matched columns (name starting with "__new__") the union of the
        output fields stored in the segmentation schemas of the matched
        columns is added to ``column.fields`` before the copy is made.
        """
        if any(existing.name == column.name for existing in self.schema):
            return

        dal_mongo = DALMongo(project_id)
        mongoclient = dal_mongo.get_mongoclient()
        db = mongoclient["project{}".format(project_id)]
        coll1 = db["SegmentationStep_source1_schema"]
        coll2 = db["SegmentationStep_source2_schema"]

        def collect(collection, col_names, known, found, types):
            # Store in `found`/`types` every output field of the given columns
            # that is not listed in `known`.
            for col_name in col_names:
                docs = collection.find({'fields': {'$ne': []},
                                        'name': col_name})
                for doc in docs:
                    for field in doc['fields']:
                        out_field = field['output_field']
                        if out_field not in known:
                            found.append(out_field)
                            types[out_field] = field['type']

        if column.name.startswith("__new__"):

            # TODO this is done assumming that the format is '__new__cols1-...-__cols2-...'
            # example: column_name = __new__name-surname__nombreyapellido
            name_parts = column.name.split('__')
            matched_columns_s1 = name_parts[2].split('-')  # ['name', 'surname']
            matched_columns_s2 = name_parts[3].split('-')  # ['nombreyapellido']

            ofs1, ofs1_type = [], {}
            collect(coll1, matched_columns_s1, ofs1, ofs1, ofs1_type)

            # NOTE(review): source 2 fields are filtered against the source 1
            # list, so repeated fields can enter ofs2; the ordered set below
            # removes them. Confirm this is intended.
            ofs2, ofs2_type = [], {}
            collect(coll2, matched_columns_s2, ofs1, ofs2, ofs2_type)

            union_output_fields = list(OrderedSet(ofs1 + ofs2))
            union_output_fields_type = dict(ofs1_type)
            union_output_fields_type.update(ofs2_type)

            for out_field in union_output_fields:
                # The type of s1 and s2 should be the same
                field_type = EnumType(union_output_fields_type[out_field])
                column.fields.append(
                    Field(tags=[],
                          output_field=out_field,
                          value="n/A",
                          tipe=field_type))

        self.schema.append(deepcopy(column))
    def _get_groups(self, source_number):
        """
        Load the schema matching records of the given source and return the
        result of running the configured module over them.
        """
        dal = DALMongo(self.project_id)

        previous_step = SchemaMatchingStep().class_name
        indexing_module = self._load_module(
            records=dal.get_records(previous_step, source_number))
        return indexing_module.run()
    def run_implementation(self):
        """
        Run the configured module over the stored matches and append the
        fused records it returns to the result collection.
        """
        # Fetch the comparison results
        matches = DALMongo(self.project_id).get_matches()

        fusion_module = self._load_module(project_id=self.project_id,
                                          matches=matches)
        self._append_result_collection(fusion_module.run())
Exemplo n.º 7
0
    def __init__(self, project_id, config, **kwargs):
        """
        Store the rules and the logical operator found in ``config`` and open
        a DAL for the project.

        NOTE(review): the vector reducer is read from ``self.config`` rather
        than from the ``config`` argument, which relies on the parent
        initialiser setting ``self.config`` - confirm.
        """
        super(RuleBasedClassification, self).__init__(**kwargs)

        # When no vector reduction function is configured, use the average
        if 'vector_reducer' not in self.config:
            self.config['vector_reducer'] = 'average'

        reducer_name = "_vector_" + self.config['vector_reducer']
        self.compute_similarity = getattr(self, reducer_name)

        self.project_id = project_id
        self.logical_operator = int(config['logical-op'])
        self.rules = config['rules']
        self.dal = DALMongo(self.project_id)
    def run_implementation(self):
        """
        Run the configured module over the fused records and the global
        schema and return its result.

        Non-matches are added to the records unless ``self.only_matches`` is
        set.
        """
        dal = DALMongo(self.project_id)

        # Fetch the data fusion results
        records = dal.get_fused_records()

        include_non_matches = not self.only_matches
        if include_non_matches:
            records += dal.get_non_matches()

        schema = dal.get_global_schema()

        export_module = self._load_module(records=records, schema=schema)
        return export_module.run()
    def run_implementation(self):
        """
        Run the configured module once per similarity vector and append the
        results to the result collection.
        """
        dal = DALMongo(self.project_id)

        # Fetch the similarity vectors
        simils = dal.get_similarity_vectors()

        classifier = self._load_module(project_id=self.project_id)
        match_results = [classifier.run(simil) for simil in simils]

        self._append_result_collection(match_results)
    def run(self):
        """
        Generic run: logs the start of the step, executes its
        run_implementation, stores self.results through the DAL and logs the
        end of the step.

        Returns whatever run_implementation returned.
        """
        logging.info("Starting step " + self.class_name)

        outcome = self.run_implementation()

        # Persist the results of the step
        DALMongo(self.project_id).store_step_results(step=self.class_name,
                                                     results=self.results)

        logging.info("Finished step " + self.class_name)
        return outcome
    def _standardise_and_tag_source(self, source_number):
        """
        Run the configured standardisation and tagging module over every
        configured column of one source.

        The records of the data cleansing step are loaded, updated in place
        and appended to the ``source<N>_records`` result collection.
        """
        dal = DALMongo(self.project_id)
        cleansed_step = DataCleansingStep().class_name
        records = dal.get_records(cleansed_step, source_number)

        source_key = "source{}".format(source_number)

        # One module per configured column, applied to each record
        for record in records:
            for col, module_config in self.config[source_key].items():
                tagger = self._load_module(module_config)
                record.columns[col] = tagger.run(record.columns[col])

        self._append_result_collection(records, source_key + "_records")
    def _segment_source(self, source_number):
        """
        Run the configured segmentation module over every configured column
        of one source and build the segmented schema.

        The segmented schema is made of the column objects returned by
        ``dal.get_schema``; every output field found while segmenting is
        added once to the fields of its column. Records and schema are
        appended to the ``source<N>_records`` and ``source<N>_schema`` result
        collections.
        """
        dal = DALMongo(self.project_id)

        previous_step = StandardisationAndTaggingStep().class_name
        records = dal.get_records(previous_step, source_number)

        # Schema columns indexed by name; their fields are extended below
        new_cols = {c_obj.name: c_obj for c_obj in dal.get_schema(source_number)}

        source_key = "source{}".format(source_number)

        for record in records:
            for col_name, module_config in self.config[source_key].items():
                segmenter = self._load_module(module_config)
                record.columns[col_name] = segmenter.run(
                    record.columns[col_name])

                # Extend the segmented schema with unseen output fields
                for field_obj in record.columns[col_name].fields:
                    schema_fields = new_cols[col_name].fields
                    out_field = field_obj.output_field
                    if out_field is None:
                        continue
                    if out_field in [f.output_field for f in schema_fields]:
                        continue
                    # TODO tags could be appended as well but for now we leave it empty
                    new_cols[col_name].fields.append(
                        Field(value="n/A",
                              tipe=field_obj.tipe,
                              output_field=out_field,
                              tags=[]))

        # Flatten the columns into a list so that the DAL can store them
        segmented_schema = list(new_cols.values())

        self._append_result_collection(records, source_key + '_records')
        self._append_result_collection(segmented_schema, source_key + '_schema')
    def config_json(project_id):
        """
        Return the description of the configuration form: a list of rows,
        each one pairing a column of the global schema with an encoding.
        """
        dal = DALMongo(project_id)

        # Only columns named with the "__new__" prefix are offered.
        # NOTE(review): the original author suspected this check is unnecessary.
        cols = []
        for c in dal.get_global_schema():
            if not c['name'].startswith('__new__'):
                continue
            cols.append({
                "label": c['custom_name'],
                "value": c['name'],
                "id": c['name'],
                "config": {
                    "key": {
                        'type': 'hidden',
                        'value': c['name'],
                    }
                }
            })

        encoding_configs = dynamic_loading.list_modules('encoding')

        key_dropdown = {
            'type': 'dropdown',
            'label': 'Select a column',
            'selectedoption': {},
            'options': cols
        }
        encoding_dropdown = {
            "type": "dropdown",
            'label': 'Select encoding',
            'selectedoption': {},
            'options': encoding_configs
        }
        rowmodel = {
            'type': 'row',
            'cols': {
                '1_key': key_dropdown,
                'encoding': encoding_dropdown
            }
        }

        keys_rows = {
            'type': 'rows',
            'rows': [],
            'label': 'Keys',
            "rowmodel": rowmodel
        }
        return {'keys': keys_rows}
    def config_json(project_id):
        """
        Return the description of the configuration form: rows that pair
        columns of source 1 with columns of source 2 under a new name, plus a
        switch to keep the remaining columns in the final schema.
        """
        dal = DALMongo(project_id)

        # Column names offered for each source
        cols1 = [column.name for column in dal.get_schema(1)]
        cols2 = [column.name for column in dal.get_schema(2)]

        source1_select = {
            'label': 'Select source 1 columns',
            'type': 'multipleselect',
            'options': cols1
        }
        source2_select = {
            'label': 'Select source 2 columns',
            'type': 'multipleselect',
            'options': cols2
        }
        name_input = {
            'label': 'New column name',
            'type': 'text'
        }
        rowmodel = {
            'type': 'row',
            'cols': {
                'source1': source1_select,
                'source2': source2_select,
                'custom_name': name_input
            }
        }

        matches_rows = {
            'type': 'rows',
            'rows': [],
            'label': 'Matches',
            "rowmodel": rowmodel
        }
        remaining_switch = {
            'label': 'Add remaining columns to the final schema',
            'type': 'toggleswitch',
            "color": 'blue',
            'checked': False
        }
        return {
            'matches': matches_rows,
            'remaining_columns': remaining_switch,
        }
    def run_implementation(self):
        """
        Particular run of this step (default implementation).

        Loads the records of both sources from the previous step, runs the
        configured module over them and appends the returned records and
        schema to the result collections.
        """
        dal = DALMongo(self.project_id)

        # The previous step is the segmentation unless it was skipped, in
        # which case drop_segmentation() is called on the DAL first.
        prevstep = "SegmentationStep"
        if self.segmentation_skipped:
            dal.drop_segmentation()
            prevstep = "StandardisationAndTaggingStep"

        source1_records = dal.get_records(prevstep, 1)
        source2_records = dal.get_records(prevstep, 2)

        matcher = self._load_module(project_id=self.project_id,
                                    records1=source1_records,
                                    records2=source2_records)

        new_schema, source1_records, source2_records = matcher.run()

        self._append_result_collection(source1_records, 'source1_records')
        self._append_result_collection(source2_records, 'source2_records')
        self._append_result_collection(new_schema, 'new_schema')
Exemplo n.º 16
0
        }
    },
    "export": {
        "selected_module": {
            "name": "mongodb",
            "config": {
                'host': "localhost",
                'port': 27017,
                'db': "base",
                'collection': "coso"
            }
        }
    }
}

# Calls drop_database() on the project's DAL before any step runs
# (presumably to start from an empty database - confirm).
dal = DALMongo(project_id)
dal.drop_database()

# Each step is selected with its own section of `config` and then executed,
# in the order written below. `w`, `config` and `project_id` are defined
# before this excerpt.
w.set_current_step("ExtractionStep", config["extraction"])
w.execute_step()

# NOTE(review): other code in this file refers to this step as
# "StandardisationAndTaggingStep" - confirm "StandardizationStep" is valid.
w.set_current_step("StandardizationStep", config["standardization"])
w.execute_step()

w.set_current_step("SegmentationStep", config["segmentation"])
w.execute_step()

w.set_current_step("SchemaMatchingStep", config["schema-matching"])
w.execute_step()

# NOTE(review): the excerpt ends here; no execute_step() call is visible for
# the indexing step.
w.set_current_step("IndexingStep", config["indexing1"])
Exemplo n.º 17
0
    def config_json(project_id):
        """
        Return the description of the configuration form: a list of rules
        (column or output field, comparison operator and threshold value)
        plus the logical operator that combines the rules.
        """
        # NOTE: loading the available vector reduction functions is currently
        # disabled, so no reducer option is offered here.

        dal = DALMongo(project_id)
        project = Project.objects.get(id=project_id)

        # Whole columns are compared when the segmentation was skipped,
        # output fields otherwise
        if project.segmentation_skipped:
            comparable = dal.get_matched_cols()
        else:
            comparable = dal.get_output_fields_matched_cols()

        cols = [{
            "label": c['name'],
            "config": {
                "val": {
                    'type': 'hidden',
                    'value': c['name'],
                }
            }
        } for c in comparable]

        # The position of each label is the code stored as the option value
        operator_labels = [
            'Greater than',
            'Less than',
            'Equal to',
            'Greater than or equal to',
            'Less than or equal to',
        ]
        operator_options = [{
            'label': label,
            'config': {
                "val": {
                    'type': 'hidden',
                    'value': code
                }
            }
        } for code, label in enumerate(operator_labels)]

        rowmodel = {
            'type': 'row',
            'cols': {
                '1_output-field-column': {
                    'label': 'Column/Output Field',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': cols
                },
                'logical-op': {
                    'label': 'Operator',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': operator_options
                },
                'value': {
                    "label": "Value",
                    "type": "slider",
                    "start": "0",
                    "end": "1",
                    "step": 0.01,
                    "color": "amber"
                }
            }
        }

        rules_rows = {
            'type': 'rows',
            'rows': [],
            'label': 'Rules',
            "rowmodel": rowmodel
        }
        rules_operator = {
            'label': 'Logical operator between rules',
            'type': 'radioinline',
            'options': [
                {
                    'label': 'AND',
                    'value': 1
                },
                {
                    'label': 'OR',
                    'value': 0
                }
            ]
        }
        return {
            'rules': rules_rows,
            'logical-op': rules_operator
        }
Exemplo n.º 18
0
class RuleBasedClassification(ClassificationModule):
    """
        Classifies matches based on logical rules applied on the individual compared
        columns/output fields. The rules are combined with a single logical
        operator, either AND or OR.

        Config format (as read by __init__ and run):
        {
            'rules': [
                {
                    '1_output-field-column': {'val': <column/output field name>},
                    'logical-op': {'val': <comparison code, 0 to 4>},
                    'value': <threshold>
                },
                ...
            ],
            'logical-op': <1 for AND, 0 for OR>
        }
        An optional 'vector_reducer' entry is read from self.config and
        defaults to 'average'.
    """

    # Codes of the logical operator that combines the rules (see config_json)
    _OP_OR = 0
    _OP_AND = 1

    # Comparison code -> predicate(threshold, score). The operand order is the
    # one of the rule: "greater than" means the score is above the threshold.
    _COMPARISONS = {
        0: lambda threshold, score: threshold < score,   # greater than
        1: lambda threshold, score: threshold > score,   # less than
        2: lambda threshold, score: threshold == score,  # equal to
        3: lambda threshold, score: threshold <= score,  # greater than or equal to
        4: lambda threshold, score: threshold >= score,  # less than or equal to
    }

    # Labels shown for each comparison code, in code order
    _COMPARISON_LABELS = [
        'Greater than',
        'Less than',
        'Equal to',
        'Greater than or equal to',
        'Less than or equal to',
    ]

    def __init__(self, project_id, config, **kwargs):
        """
        Store the rules and the logical operator found in ``config`` and open
        a DAL for the project.

        NOTE(review): the vector reducer is read from ``self.config`` rather
        than from the ``config`` argument, which relies on the parent
        initialiser setting ``self.config`` - confirm.
        """
        super(RuleBasedClassification, self).__init__(**kwargs)

        # When no vector reduction function is configured, use the average
        if 'vector_reducer' not in self.config:
            self.config['vector_reducer'] = 'average'

        self.compute_similarity = getattr(self, "_vector_" + self.config['vector_reducer'])
        self.project_id = project_id
        self.logical_operator = int(config['logical-op'])
        self.rules = config['rules']
        self.dal = DALMongo(self.project_id)

    @staticmethod
    def pretty_name():
        """Return the name of the module shown to the user."""
        return "Rule-based classification"

    def run(self, simil):
        """
        Classify one similarity vector.

        Every rule compares its threshold with the entry of ``simil.vector``
        that belongs to the rule's column/output field; the outcomes are
        combined with AND or OR. With no rules the result is a match for AND
        and a no-match for OR.

        Returns a MatchResult for ``simil.record1`` and ``simil.record2``.
        Raises ValueError when the logical operator is neither 0 nor 1, and
        KeyError when a rule names an unknown column/output field.
        """
        rules_logical_op = self.logical_operator
        if rules_logical_op not in (self._OP_AND, self._OP_OR):
            # Without this check the evaluation below would never be
            # initialised and the method would fail with UnboundLocalError.
            raise ValueError(
                "Unsupported logical operator between rules: {!r} "
                "(expected 1 for AND or 0 for OR)".format(rules_logical_op))

        vector = simil.vector

        # Given the fact that the simil vector is sorted, the columns/output
        # fields must be obtained again from the DAL because the user can send
        # the rules per column/output field in any order.
        # NOTE(review): this lookup is repeated on every call.
        project = Project.objects.get(id=self.project_id)
        if project.segmentation_skipped:
            compared = self.dal.get_matched_cols()
        else:
            compared = self.dal.get_output_fields_matched_cols()
        cols_order = {c['name']: idx for idx, c in enumerate(compared)}

        # Start from the neutral element: True for AND, False for OR
        apply_and = rules_logical_op == self._OP_AND
        rules_evaluation = apply_and

        for rule in self.rules:
            # Index of the simil vector entry this rule refers to
            position = cols_order[rule['1_output-field-column']['val']]

            compare = self._COMPARISONS.get(rule['logical-op']['val'])
            if compare is None:
                # Rules with an unknown comparison code are ignored
                continue

            # and/or keep short-circuiting: once the outcome is decided the
            # remaining comparisons are not evaluated
            if apply_and:
                rules_evaluation = rules_evaluation and compare(
                    rule['value'], vector[position])
            else:
                rules_evaluation = rules_evaluation or compare(
                    rule['value'], vector[position])

        match_type = MatchResultType.match if rules_evaluation else MatchResultType.no_match

        return MatchResult(simil.record1, simil.record2, match_type)

    @staticmethod
    def _vector_average(vector):
        """Return the arithmetic mean of the vector (fails on an empty one)."""
        return sum(vector) / len(vector)

    @staticmethod
    def config_json(project_id):
        """
        Return the description of the configuration form: a list of rules
        (column or output field, comparison operator and threshold value)
        plus the logical operator that combines the rules.
        """
        # NOTE: loading the available vector reduction functions is currently
        # disabled, so no reducer option is offered here.

        dal = DALMongo(project_id)
        project = Project.objects.get(id=project_id)

        # Whole columns are compared when the segmentation was skipped,
        # output fields otherwise
        if project.segmentation_skipped:
            comparable = dal.get_matched_cols()
        else:
            comparable = dal.get_output_fields_matched_cols()

        cols = [{
            "label": c['name'],
            "config": {
                "val": {
                    'type': 'hidden',
                    'value': c['name'],
                }
            }
        } for c in comparable]

        # The position of each label is the code stored as the option value
        operator_options = [{
            'label': label,
            'config': {
                "val": {
                    'type': 'hidden',
                    'value': code
                }
            }
        } for code, label in enumerate(RuleBasedClassification._COMPARISON_LABELS)]

        rowmodel = {
            'type': 'row',
            'cols': {
                '1_output-field-column': {
                    'label': 'Column/Output Field',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': cols
                },
                'logical-op': {
                    'label': 'Operator',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': operator_options
                },
                'value': {
                    "label": "Value",
                    "type": "slider",
                    "start": "0",
                    "end": "1",
                    "step": 0.01,
                    "color": "amber"
                }
            }
        }

        return {
            'rules': {
                'type': 'rows',
                'rows': [],
                'label': 'Rules',
                "rowmodel": rowmodel
            },
            'logical-op': {
                'label': 'Logical operator between rules',
                'type': 'radioinline',
                'options': [
                    {
                        'label': 'AND',
                        'value': 1
                    },
                    {
                        'label': 'OR',
                        'value': 0
                    }
                ]
            }
        }
    def run_implementation(self):
        """
        Build one similarity vector per pair of records of each indexing
        group and append all of them to the result collection.

        For every matched column (named with the "__new__" prefix) the
        configured comparison modules are run, either per output field or,
        when the segmentation was skipped, over the whole column. Each
        similarity value is scaled by the module weight divided by the
        largest configured weight.
        """
        dal = DALMongo(self.project_id)

        # Fetch the groups of records
        groups = dal.get_indexing_groups()

        # Matched columns and the fields stored for each of them
        output_fields_schema = {}
        matched_cols = []
        for column in dal.get_global_schema():
            if not column['name'].startswith("__new__"):
                continue
            output_fields_schema[column['name']] = column['fields']
            matched_cols.append(column['name'])

        # Largest weight among the configured modules (fails on empty config)
        max_weight = max(
            [float(module_cfg['weight']) for module_cfg in self.config.values()])

        simils = []
        for group in groups:
            for r1 in group.records1:
                for r2 in group.records2:
                    # Initialize similarity vector
                    sv = SimilarityVector(r1._id, r2._id, group=group.key)

                    for col in matched_cols:  # could be r2.matched_cols() as well (they return the same)
                        if self.segmentation_skipped:
                            comparison_module = self.config[col]

                            # Fetch the complete values of the column
                            column_value_s1 = r1.get_field_col(col)
                            column_value_s2 = r2.get_field_col(col)

                            comparator = self._load_module(comparison_module)
                            weight = float(comparison_module['weight'])

                            # Store the weighted comparison in the vector
                            sim_value = comparator.run(column_value_s1,
                                                       column_value_s2)
                            sv.vector.append(sim_value * weight / max_weight)
                            sv.comparisons.append(
                                {'values': [column_value_s1, column_value_s2]})
                            continue

                        for out_field, comparison_module in self.config.items():
                            # Output fields missing from the column do not
                            # create an entry in the similarity vector
                            col_out_fields = [
                                f['output_field']
                                for f in output_fields_schema[col]
                            ]
                            if out_field not in col_out_fields:
                                continue

                            # Fetch the values to compare
                            out_field_value1 = r1.get_output_field_col(
                                out_field, col)
                            out_field_value2 = r2.get_output_field_col(
                                out_field, col)

                            comparator = self._load_module(comparison_module)
                            weight = float(comparison_module['weight'])

                            # Store the weighted comparison in the vector
                            sim_value = comparator.run(out_field_value1,
                                                       out_field_value2)
                            sv.vector.append(sim_value * weight / max_weight)
                            sv.comparisons.append({
                                'values':
                                [out_field_value1, out_field_value2],
                                'output_field':
                                out_field
                            })

                    simils.append(sv)

        self._append_result_collection(simils)