def find_sources(
        *args: Union[str, ScanSource, List],
        data_sources: List[SourceConfig],
        column_report_filename: str = None,
        file_source: ExcelFileSource = ExcelFileSource()
) -> List[SheetWithHeaders]:
    """
    Scan the filesystem for candidate files, discover the tables they contain
    and match their columns against the supplied configuration.

    :param args: Files to scan - can include wildcard characters (glob patterns)
    :param data_sources: Configuration for tables and columns
    :param column_report_filename: Optional generation of a report summarising
        matches; the report step is skipped when this is None. The report can
        be edited and fed back into the
        :func:`~fddc.annex_a.merger.read_sources` function.
    :param file_source: Handed unchanged to ``workbook_util.find_worksheets``
        for every candidate file. The default instance is created once, when
        the function is defined, and reused by every call that omits it.
    :return: discovered sources
    """
    scan_sources = __to_scan_source(args)

    # Stage 1: expand every scan source into its candidate input files
    candidate_files: List[FileSource] = [
        found
        for scan_source in scan_sources
        for found in file_scanner.find_input_files(scan_source)
    ]
    logger.info("Found {} candidate input files".format(len(candidate_files)))

    # Stage 2: look inside each candidate file for worksheets
    worksheets: List[WorkSheetDetail] = [
        worksheet
        for candidate in candidate_files
        for worksheet in workbook_util.find_worksheets(
            candidate, file_source=file_source)
    ]
    logger.info("Found {} candidate data sources".format(len(worksheets)))

    # Stage 3: pair worksheets with configured data sources, then pair the
    # headers of the matched sheets with the configured columns
    matched_sheets, unmatched_sheets = matcher.match_data_sources(
        worksheets, data_sources)
    sheet_with_columns: List[SheetWithHeaders] = matcher.match_columns(
        matched_sheets)

    # Stage 4 (optional): produce the column report
    if column_report_filename is None:
        return sheet_with_columns

    matcher_report.column_report(
        sheet_with_columns, unmatched_sheets, column_report_filename)
    return sheet_with_columns
def test_column_report(self):
    # Match the columns of the shared test sheet and build a report from
    # the outcome.
    matched_sheets = matcher.match_columns([self._get_test_sheet()])
    report = fddc.annex_a.merger.matcher_report.column_report(matched_sheets)

    # The report must expose exactly these columns, in this order.
    expected_columns = [
        'filename',
        'sort_key',
        'header_starts',
        'sheetname',
        'table',
        'column_name',
        'header_name',
    ]
    self.assertEqual(expected_columns, report.columns.tolist())

    # Row-by-row expectations: the third row has an empty header name and
    # the fourth row has no column name (NaN) next to 'Header T'.
    self.assertEqual(report.column_name.tolist(),
                     ['Header 1', 'Header X', 'Header Y', np.nan])
    self.assertEqual(report.header_name.tolist(),
                     ['Header 1', 'Header X', '', 'Header T'])
def _find_by_attr(items, attr, expected, description):
    """Return the first element of *items* whose *attr* equals *expected*.

    :param items: Iterable of objects to search
    :param attr: Name of the attribute to compare
    :param expected: Value the attribute must be equal to
    :param description: Human readable description used in the error message
    :raises ValueError: if no element matches
    """
    for item in items:
        if getattr(item, attr) == expected:
            return item
    raise ValueError(
        f"Report refers to {description}, which could not be found")


def _find_sheet_and_config(file, sort_key, sheetname, table, data_sources):
    """Look up the worksheet and the source configuration named by a report key.

    :return: tuple of (worksheet detail, source configuration)
    :raises ValueError: if the worksheet or the configuration does not exist
    """
    worksheet_list = workbook_util.find_worksheets(
        FileSource(file, sort_key=sort_key))
    sheet_detail = _find_by_attr(
        worksheet_list, 'sheetname', sheetname,
        f"worksheet {sheetname!r} in file {file!r}")
    source_config = _find_by_attr(
        data_sources, 'name', table,
        f"data source configuration {table!r}")
    return sheet_detail, source_config


def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """
    Turn a (possibly hand-edited) column report back into matched sheets.

    Rows are handled according to what they specify:

    * rows without a sheetname name files that still have to be scanned; the
      worksheets found in them are matched against ``data_sources``
    * rows with a sheetname are grouped by (filename, sort_key, sheetname,
      table); a group with a table but no column names has its columns
      matched by ``matcher.match_columns``
    * rows with both a column name and a header name are explicit mappings
      and are converted directly into matched columns

    :param match_input: The report rows. A string or DataFrame is first
        converted with ``parse_report``.
    :param data_sources: Configuration for tables and columns
    :return: tuple of (sheets with matched headers, unmatched worksheets)
    :raises ValueError: if the report refers to a worksheet, data source,
        column or header that cannot be found
    """
    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    files_to_scan = set()
    columns_per_table = {}
    mapping_dict: Dict[Tuple, List[MatchInput]] = {}
    for entry in match_input:
        key = (entry.filename, entry.sort_key, entry.sheetname, entry.table)

        if entry.sheetname is None:
            # First we look for unscanned files
            sort_key = None
            if entry.sort_key is not None:
                sort_key = [f'/.*/{entry.sort_key}/']
            scan_source = ScanSource(include=entry.filename,
                                     sort_keys=sort_key)
            files_to_scan.update(file_scanner.find_input_files(scan_source))
        else:
            # Then we build a lookup of files and tables to see if any
            # tables have no columns listed
            columns = columns_per_table.setdefault(key, set())
            if entry.column_name is not None:
                columns.add(entry.column_name)

        # Explicit column -> header mappings are collected for every row
        if entry.column_name is not None and entry.header_name is not None:
            mapping_dict.setdefault(key, []).append(entry)

    matched_list: List[MatchedSheet] = []
    unmatched_list: List[WorkSheetDetail] = []
    for file in files_to_scan:
        worksheets = workbook_util.find_worksheets(file)

        # Match datasources based on configuration
        matched, unmatched = matcher.match_data_sources(
            worksheets, data_sources)
        matched_list += matched
        unmatched_list += unmatched

    # Sheets assigned to a table but without any column listed are queued
    # for column matching
    for (file, sort_key, sheetname, table), columns in columns_per_table.items():
        if not columns and table is not None:
            worksheet, source_config = _find_sheet_and_config(
                file, sort_key, sheetname, table, data_sources)
            matched_list.append(
                MatchedSheet(sheet_detail=worksheet,
                             source_config=source_config))

    # Match headers to column configuration
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_list)

    # Sheets with explicit mappings bypass the matcher entirely
    for (file, sort_key, sheetname, table), mapping_list in mapping_dict.items():
        sheet_detail, source_config = _find_sheet_and_config(
            file, sort_key, sheetname, table, data_sources)
        sheet = MatchedSheet(sheet_detail=sheet_detail,
                             source_config=source_config)

        column_list: List[MatchedColumn] = []
        for mapping in mapping_list:
            column_config = _find_by_attr(
                source_config.columns, 'name', mapping.column_name,
                f"column {mapping.column_name!r} of table {table!r}")
            header_config = _find_by_attr(
                sheet_detail.headers, 'value', mapping.header_name,
                f"header {mapping.header_name!r} in worksheet {sheetname!r} "
                f"of file {file!r}")
            column_list.append(
                MatchedColumn(column=column_config, header=header_config))

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet, columns=column_list,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_list
def test_match_multiple_column(self):
    # Feed the shared test sheet through column matching on its own.
    source_sheet = self._get_test_sheet()
    matched_sheets = matcher.match_columns([source_sheet])

    # Exactly one result is expected, and it is checked against the input
    # sheet by the assert_sheet helper.
    self.assertEqual(len(matched_sheets), 1)
    only_result = matched_sheets[0]
    self.assert_sheet(only_result, source_sheet)