def _get_groups(self, source_number):
    """Run the grouping module over the schema-matching output of one source.

    Fetches the records produced by the schema-matching step for
    *source_number*, feeds them to the configured module, and returns
    whatever the module's ``run()`` produces.
    """
    dal = DALMongo(self.project_id)
    matched_records = dal.get_records(SchemaMatchingStep().class_name,
                                      source_number)
    grouping_module = self._load_module(records=matched_records)
    return grouping_module.run()
def _clean_source(self, source_number):
    """Apply every configured data-cleansing module to each record of a source.

    Reads the extraction-step records for *source_number*, runs the chain of
    cleansing modules configured for each column over every record (mutating
    ``record.columns`` in place), and appends the cleansed records to the
    step's result collection.
    """
    # Fetch the records produced by the extraction step
    dal = DALMongo(self.project_id)
    records = dal.get_records(ExtractionStep().class_name, source_number)

    # Hoisted out of the record loop: the per-source configuration is
    # loop-invariant (it was previously re-looked-up for every record).
    source_config = self.config["source{}".format(source_number)]

    # Do cleansing for each column of each record
    for record in records:
        for col, datacleansing_modules in source_config.items():
            for datacleansing_module in datacleansing_modules:
                module = self._load_module(datacleansing_module)
                # TODO the module should be given only the field value (string) and not the column
                record.columns[col] = module.run(record.columns[col])

    self._append_result_collection(
        records, "source{}_records".format(source_number))
def _standardise_and_tag_source(self, source_number):
    """Run the standardisation-and-tagging module on every column of a source.

    Reads the cleansed records for *source_number* from MongoDB, replaces
    each configured column's value with the output of its module's ``run()``
    (in place), and appends the result to the step's result collection.
    """
    # Get cleansed records from MongoDB
    dal = DALMongo(self.project_id)
    records = dal.get_records(DataCleansingStep().class_name, source_number)

    # Hoisted out of the record loop: the per-source configuration is
    # loop-invariant (it was previously re-looked-up for every record).
    source_config = self.config["source{}".format(source_number)]

    # Run standardisation and tagging module for each column of each record
    for record in records:
        for col, standardisation_tagging_module in source_config.items():
            module = self._load_module(standardisation_tagging_module)
            record.columns[col] = module.run(record.columns[col])

    self._append_result_collection(
        records, "source{}_records".format(source_number))
def _segment_source(self, source_number):
    """Segment each configured column of a source and derive the new schema.

    Runs the configured segmentation module over every column of every
    record (mutating ``record.columns`` in place). While doing so it extends
    the original column schema with any output field discovered by the
    segmentation, then stores both the segmented records and the resulting
    segmented schema in the step's result collections.
    """
    dal = DALMongo(self.project_id)
    records = dal.get_records(StandardisationAndTaggingStep().class_name,
                              source_number)

    # Start the new segmented schema from the original column schema.
    new_cols = {c_obj.name: c_obj for c_obj in dal.get_schema(source_number)}

    # Track the output fields already present per column so the membership
    # test below is O(1) instead of rescanning the field list per field.
    seen_output_fields = {
        name: {field.output_field for field in col_obj.fields}
        for name, col_obj in new_cols.items()
    }

    # Hoisted out of the record loop: the per-source configuration is
    # loop-invariant (it was previously re-looked-up for every record).
    source_config = self.config["source{}".format(source_number)]

    # Run segmentation module for each column of each record
    for record in records:
        for col_name, segmentation_module in source_config.items():
            module = self._load_module(segmentation_module)
            record.columns[col_name] = module.run(record.columns[col_name])

            # Grow the segmented schema with any newly discovered output field
            for field_obj in record.columns[col_name].fields:
                if field_obj.output_field is not None and \
                        field_obj.output_field not in seen_output_fields[col_name]:
                    # TODO tags could be appended as well but for now we leave it empty
                    new_of = Field(value="n/A",
                                   tipe=field_obj.tipe,
                                   output_field=field_obj.output_field,
                                   tags=[])
                    new_cols[col_name].fields.append(new_of)
                    seen_output_fields[col_name].add(field_obj.output_field)

    # Flatten the schema mapping so the DAL can store it
    segmented_schema = list(new_cols.values())

    self._append_result_collection(
        records, 'source{}_records'.format(source_number))
    self._append_result_collection(segmented_schema,
                                   'source{}_schema'.format(source_number))
def run_implementation(self):
    """Signature of each step's particular run. Default implementation.

    Loads the records both sources produced in the previous step (falling
    back to the standardisation-and-tagging output when segmentation was
    skipped), runs the configured module on them, and stores the returned
    schema and record sets in the step's result collections.
    """
    dal = DALMongo(self.project_id)

    if self.segmentation_skipped:
        # No segmentation output exists; drop its leftovers and read the
        # standardisation-and-tagging results instead.
        dal.drop_segmentation()
        previous_step = "StandardisationAndTaggingStep"
    else:
        previous_step = "SegmentationStep"

    source1_records = dal.get_records(previous_step, 1)
    source2_records = dal.get_records(previous_step, 2)

    matching_module = self._load_module(project_id=self.project_id,
                                        records1=source1_records,
                                        records2=source2_records)
    new_schema, records1, records2 = matching_module.run()

    self._append_result_collection(records1, 'source1_records')
    self._append_result_collection(records2, 'source2_records')
    self._append_result_collection(new_schema, 'new_schema')